# Initial Setup

In [2]:
import pyspark 
import pandas as pd
import numpy as np
from pyspark.ml.recommendation import ALSModel, ALS
from keras.models import Sequential
from keras.layers import Dense 
from keras.optimizers import Adam 
from sklearn.preprocessing import OneHotEncoder, StandardScaler

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
spark  = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## All Dataframes

### Ratings

In [4]:
ratings = spark.read.json('data/ratings.json')
ratings.persist()

ratings_df = ratings.toPandas()
ratings_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,858,4,956678732.0,6040
1,2384,4,956678754.0,6040
2,593,5,956678754.0,6040
3,1961,4,956678777.0,6040
4,1419,3,956678856.0,6040


### Movies

In [5]:
movies = pd.read_csv('data/movies.dat', sep='::', engine='python', header=None)
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### Users

In [6]:
users = pd.read_csv('data/users.dat', sep='::', engine='python', header=None)
users = users.rename({0:'user_id', 
              1:'gender', 
              2:'min_age', 
              3:'occupation', 
              4:'zipcode'}, 
             axis=1)
users.head()

Unnamed: 0,user_id,gender,min_age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
# To see user age ranges
users.min_age.value_counts()

25    2096
35    1193
18    1103
45     550
50     496
56     380
1      222
Name: min_age, dtype: int64

### Requests (to predict)

In [65]:
requests = spark.read.json('data/requests.json')
requests.persist()
requests.show(5)
requests_df=requests.toPandas()

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|    2019|   NaN|9.56678777E8|   6040|
|     759|   NaN|9.56679248E8|   6040|
|    2858|   NaN|9.56679275E8|   6040|
|     246|   NaN|9.56679413E8|   6040|
|    1617|   NaN|9.56679473E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



# ALS Model

In [9]:
als = ALS(
    rank=11,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating'
)

In [10]:
als_model = als.fit(ratings)
preds = als_model.transform(ratings)            # Known ratings
request_preds = als_model.transform(requests)   # Unknown ratings

nan_df = request_preds.toPandas()  # Fill prediction column with predicted ratings for users we have ratings from.
nan_df.head()                      # Cold start users have a predicted rating of NaN.

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959026.0,53,
1,148,,976559602.0,4169,3.098435
2,148,,989024856.0,5333,2.433347
3,148,,977005381.0,4387,2.16267
4,148,,966907208.0,3539,2.820822


Since our ALS model can predict a rating for users who have rated movies in the past, we ignore these users and focus only on 'Cold Start Users', or users who have no prior movie rating in our database.

In [11]:
nan_df = nan_df[nan_df['prediction'].isnull()]  
nan_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959000.0,53,
6,148,,976841600.0,216,
7,148,,976191200.0,482,
9,148,,1026978000.0,424,
14,463,,978242800.0,26,


# Data Cleaning

### Movie Meta Data

In [12]:
meta_df = pd.read_csv('data/movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


There are some rows that have incorrectly formatted ids. Below we locate them and remove them from the data.

In [13]:
meta_df[meta_df.id.str.contains('-')==True]
#drop things that got shifted 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [14]:
bad_ids = ['1997-08-20', '2012-09-29', '2014-01-01'] 

In [15]:
meta_df = meta_df[~meta_df['id'].isin(bad_ids)]

In [16]:
meta_df['id'] = meta_df['id'].astype(int)  # Set all values in the id column to an integer type.

### Combining DataFrames

#### Merging ratings_df / meta_df / users:

In [17]:
all_training_data_df = ratings_df.merge(meta_df, how='left', left_on='movie_id', right_on='id')
all_training_data_df = all_training_data_df.merge(users, how='left', left_on='user_id', right_on='user_id')
all_training_data_df.head().T

Unnamed: 0,0,1,2,3,4
movie_id,858,2384,593,1961,1419
rating,4,4,5,4,3
timestamp,9.56679e+08,9.56679e+08,9.56679e+08,9.56679e+08,9.56679e+08
user_id,6040,6040,6040,6040,6040
adult,False,,False,False,False
belongs_to_collection,,,,,
budget,21000000,,0,1500000,0
genres,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...","[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
homepage,,,,http://www.dhentertainment.com/projects/1.my-n...,http://www.vollidiot-derfilm.de/
id,858,,593,1961,1419


#### Merging the nan_df with meta_df and users:

In [18]:
all_data_df = nan_df.merge(meta_df, how='left', left_on='movie_id', right_on='id')
all_data_df = all_data_df.merge(users, how='left', left_on='user_id', right_on='user_id')
all_data_df.head(2)

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction,adult,belongs_to_collection,budget,genres,homepage,...,status,tagline,title,video,vote_average,vote_count,gender,min_age,occupation,zipcode
0,148,,977959026.0,53,,False,,5000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,Released,,The Secret Life of Words,False,6.8,52.0,M,25,0,96931
1,148,,976841639.0,216,,False,,5000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,Released,,The Secret Life of Words,False,6.8,52.0,M,45,13,52761


# Testing Data

In [19]:
X = all_data_df.filter(['occupation','min_age','gender','vote_count', 'vote_average', 
                'runtime', 'revenue', 'release_date', 'popularity', 
               'budget', 'adult', 'user_id', 'movie_id'], axis=1)

In [20]:
y = all_data_df.filter(['prediction'], axis=1)

#### Data Cleaning

In [21]:
# Converting the gender feature to a 1 (F) or 0 (M).
gender_dict = {'M':0, 'F':1}
X['gender'] = X['gender'].replace(gender_dict)

In [22]:
# Converting adult videos to a boolean. 
adult_dict = {'True':True, 'False':False}

X['adult'] = X['adult'].replace(adult_dict)
X['adult'] = X['adult'].astype(bool)

In [23]:
# Converting budget column to a float.
X['budget'] = X['budget'].astype(float)

In [24]:
# Converting release date to an integer.
X['release_date'] = pd.DatetimeIndex(X['release_date']).astype(np.int64)

In [25]:
# Converting popularity to a float.
X['popularity'] = X['popularity'].astype(float)

#### One Hot Encoding Occupation Column

In [26]:
#one-hot encode occupation
enc_cols = X['occupation'].values.reshape(-1, 1)
encoder = OneHotEncoder().fit(enc_cols)
encoder.get_feature_names(['occupation'])

ohe = pd.DataFrame(encoder.transform(enc_cols).toarray(),
                   columns=encoder.get_feature_names(['occupation']))

X = pd.concat([X.drop(['occupation'], axis=1), ohe], axis=1, )

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [27]:
X.head().T

Unnamed: 0,0,1,2,3,4
min_age,25,45,25,25,25
gender,0,0,0,0,0
vote_count,52,52,52,52,
vote_average,6.8,6.8,6.8,6.8,
runtime,112,112,112,112,
revenue,0,0,0,0,
release_date,1134604800000000000,1134604800000000000,1134604800000000000,1134604800000000000,-9223372036854775808
popularity,12.7756,12.7756,12.7756,12.7756,
budget,5e+06,5e+06,5e+06,5e+06,
adult,False,False,False,False,True


In [28]:
# Removing 'other/not specified' occupation
X.drop(['occupation_0.0'], axis=1, inplace=True)

In [29]:
# Populating occupation column with actual names
X.rename({'occupation_1.0':'academic_educator', 
          'occupation_2.0':'artist',
         'occupation_3.0':'clerical_admin',
         'occupation_4.0':'coll_grad_student',
         'occupation_5.0':'cust_service',
         'occupation_6.0':'doctor',
         'occupation_7.0':'exec',
         'occupation_8.0':'farmer',
         'occupation_9.0':'homemaker',
         'occupation_10.0':'young_student',
         'occupation_11.0':'lawyer',
         'occupation_12.0':'programmer',
         'occupation_13.0':'retired',
         'occupation_14.0':'sales_mkting',
          'occupation_15.0':'scientist',
         'occupation_16.0':'self_employed',
         'occupation_17.0':'tech_eng',
         'occupation_18.0':'tradesman',
          'occupation_19.0':'unemployed',
          'occupation_20.0':'writer',}, axis=1, inplace=True)

In [30]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95628 entries, 0 to 95627
Data columns (total 32 columns):
min_age              95628 non-null int64
gender               95628 non-null int64
vote_count           48000 non-null float64
vote_average         48000 non-null float64
runtime              48000 non-null float64
revenue              48000 non-null float64
release_date         95628 non-null int64
popularity           48000 non-null float64
budget               48000 non-null float64
adult                95628 non-null bool
user_id              95628 non-null int64
movie_id             95628 non-null int64
academic_educator    95628 non-null float64
artist               95628 non-null float64
clerical_admin       95628 non-null float64
coll_grad_student    95628 non-null float64
cust_service         95628 non-null float64
doctor               95628 non-null float64
exec                 95628 non-null float64
farmer               95628 non-null float64
homemaker            956

#### Making 2 Versions of X:

X_3 -- Removes columns with significant nulls     
X_2 -- Removes all rows with null values

In [31]:
#X_3 will have more rows to train the model but we'll only take the predictions of the ones we still need
X_3 = X.drop(['vote_count', 
              'vote_average', 
              'runtime', 
              'revenue', 
              'popularity', 
              'budget', 
              'adult'],
               axis=1)
X_2 = X.dropna()

In [32]:
X_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48000 entries, 0 to 95491
Data columns (total 32 columns):
min_age              48000 non-null int64
gender               48000 non-null int64
vote_count           48000 non-null float64
vote_average         48000 non-null float64
runtime              48000 non-null float64
revenue              48000 non-null float64
release_date         48000 non-null int64
popularity           48000 non-null float64
budget               48000 non-null float64
adult                48000 non-null bool
user_id              48000 non-null int64
movie_id             48000 non-null int64
academic_educator    48000 non-null float64
artist               48000 non-null float64
clerical_admin       48000 non-null float64
coll_grad_student    48000 non-null float64
cust_service         48000 non-null float64
doctor               48000 non-null float64
exec                 48000 non-null float64
farmer               48000 non-null float64
homemaker            480

In [33]:
X_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95628 entries, 0 to 95627
Data columns (total 25 columns):
min_age              95628 non-null int64
gender               95628 non-null int64
release_date         95628 non-null int64
user_id              95628 non-null int64
movie_id             95628 non-null int64
academic_educator    95628 non-null float64
artist               95628 non-null float64
clerical_admin       95628 non-null float64
coll_grad_student    95628 non-null float64
cust_service         95628 non-null float64
doctor               95628 non-null float64
exec                 95628 non-null float64
farmer               95628 non-null float64
homemaker            95628 non-null float64
young_student        95628 non-null float64
lawyer               95628 non-null float64
programmer           95628 non-null float64
retired              95628 non-null float64
sales_mkting         95628 non-null float64
scientist            95628 non-null float64
self_employed        

In [34]:
print(f'There are {95628-48000} predictions for our third model to make.')

There are 47628 predictions for our third model to make.


# Training Data:

In [35]:
all_training_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 719949 entries, 0 to 719948
Data columns (total 32 columns):
movie_id                 719949 non-null int64
rating                   719949 non-null int64
timestamp                719949 non-null float64
user_id                  719949 non-null int64
adult                    361823 non-null object
belongs_to_collection    88134 non-null object
budget                   361823 non-null object
genres                   361823 non-null object
homepage                 90115 non-null object
id                       361823 non-null float64
imdb_id                  361823 non-null object
original_language        361823 non-null object
original_title           361823 non-null object
overview                 360854 non-null object
popularity               361823 non-null object
poster_path              361823 non-null object
production_companies     361823 non-null object
production_countries     361823 non-null object
release_date             361

In [36]:
X_train = all_training_data_df.filter(['occupation',
                                       'min_age',
                                       'gender',
                                       'vote_count', 
                                       'vote_average', 
                                       'runtime', 
                                       'revenue', 
                                       'release_date', 
                                       'popularity', 
                                       'budget', 
                                       'adult', 
                                       'user_id', 
                                       'movie_id', 
                                       'rating'], 
                                        axis=1)

#### Data Cleaning

In [37]:
# Converting Gender to a 1 (F) or 0 (M)
X_train['gender'] = X_train['gender'].replace(gender_dict)

In [38]:
# Converting adult to a boolean.
X_train['adult'] = X_train['adult'].replace(adult_dict)
X_train['adult'] = X_train['adult'].astype(bool)

In [39]:
# Converting budget to a float.
X_train['budget'] = X_train['budget'].astype(float)

In [40]:
# Converting release date to an integer.
X_train['release_date'] = pd.DatetimeIndex(X_train['release_date']).astype(np.int64)

In [41]:
# Converting popularity to a float.
X_train['popularity'] = X_train['popularity'].astype(float)

#### One Hot Encoding Occupation Column

In [42]:
#one-hot encode occupation
enc_cols = X_train['occupation'].values.reshape(-1, 1)
encoder = OneHotEncoder().fit(enc_cols)
encoder.get_feature_names(['occupation'])

ohe = pd.DataFrame(encoder.transform(enc_cols).toarray(),
                   columns=encoder.get_feature_names(['occupation']))

X_train = pd.concat([X_train.drop(['occupation'], axis=1), ohe], axis=1, )

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [43]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 719949 entries, 0 to 719948
Data columns (total 34 columns):
min_age            719949 non-null int64
gender             719949 non-null int64
vote_count         361823 non-null float64
vote_average       361823 non-null float64
runtime            361823 non-null float64
revenue            361823 non-null float64
release_date       719949 non-null int64
popularity         361823 non-null float64
budget             361823 non-null float64
adult              719949 non-null bool
user_id            719949 non-null int64
movie_id           719949 non-null int64
rating             719949 non-null int64
occupation_0.0     719949 non-null float64
occupation_1.0     719949 non-null float64
occupation_2.0     719949 non-null float64
occupation_3.0     719949 non-null float64
occupation_4.0     719949 non-null float64
occupation_5.0     719949 non-null float64
occupation_6.0     719949 non-null float64
occupation_7.0     719949 non-null float64
o

In [44]:
# Removing 'other/not specified' occupation
X_train.drop(['occupation_0.0'], axis=1, inplace=True)

In [45]:
# Populating occupation columns with actual names
X_train.rename({'occupation_1.0':'academic_educator', 
                'occupation_2.0':'artist',
                'occupation_3.0':'clerical_admin',
                'occupation_4.0':'coll_grad_student',
                'occupation_5.0':'cust_service',
                'occupation_6.0':'doctor',
                'occupation_7.0':'exec',
                'occupation_8.0':'farmer',
                'occupation_9.0':'homemaker',
                'occupation_10.0':'young_student',
                'occupation_11.0':'lawyer',
                'occupation_12.0':'programmer',
                'occupation_13.0':'retired',
                'occupation_14.0':'sales_mkting',
                'occupation_15.0':'scientist',
                'occupation_16.0':'self_employed',
                'occupation_17.0':'tech_eng',
                'occupation_18.0':'tradesman',
                'occupation_19.0':'unemployed',
                'occupation_20.0':'writer',}, 
                 axis=1, 
                 inplace=True)

In [46]:
X_train.rating.unique()

array([4, 5, 3, 2, 1])

#### Making 2 Versions of X:

X_3 -- Removes columns with significant nulls     
X_2 -- Removes all rows with null values

#### X3:

In [47]:
#X_3 will have more rows to train the model but we'll only take the predictions of the ones we still need
X_3_train = X_train.drop(['vote_count', 
                          'vote_average', 
                          'runtime', 
                          'revenue', 
                          'popularity', 
                          'budget', 
                          'adult', 
                          'rating'],
                           axis=1)

In [48]:
y_3_train = X_train.filter(['rating'], axis=1)

#### X2:

In [49]:
X_2_train = X_train.dropna()
y_2_train = X_2_train.filter(['rating'], axis=1)
X_2_train = X_2_train.drop(['rating'], axis=1)

In [50]:
X_2_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 361823 entries, 0 to 719948
Data columns (total 32 columns):
min_age              361823 non-null int64
gender               361823 non-null int64
vote_count           361823 non-null float64
vote_average         361823 non-null float64
runtime              361823 non-null float64
revenue              361823 non-null float64
release_date         361823 non-null int64
popularity           361823 non-null float64
budget               361823 non-null float64
adult                361823 non-null bool
user_id              361823 non-null int64
movie_id             361823 non-null int64
academic_educator    361823 non-null float64
artist               361823 non-null float64
clerical_admin       361823 non-null float64
coll_grad_student    361823 non-null float64
cust_service         361823 non-null float64
doctor               361823 non-null float64
exec                 361823 non-null float64
farmer               361823 non-null float64
ho

In [51]:
X_3_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 719949 entries, 0 to 719948
Data columns (total 25 columns):
min_age              719949 non-null int64
gender               719949 non-null int64
release_date         719949 non-null int64
user_id              719949 non-null int64
movie_id             719949 non-null int64
academic_educator    719949 non-null float64
artist               719949 non-null float64
clerical_admin       719949 non-null float64
coll_grad_student    719949 non-null float64
cust_service         719949 non-null float64
doctor               719949 non-null float64
exec                 719949 non-null float64
farmer               719949 non-null float64
homemaker            719949 non-null float64
young_student        719949 non-null float64
lawyer               719949 non-null float64
programmer           719949 non-null float64
retired              719949 non-null float64
sales_mkting         719949 non-null float64
scientist            719949 non-null float64

# Neural Network Model:

In [51]:
# ss = StandardScaler()

In [52]:
# X_2_scaled = ss.fit_transform(X_2)
# X_3_scaled = ss.fit_transform(X_3)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [53]:
# X_2_train_scaled = ss.fit_transform(X_2_train)
# X_3_train_scaled = ss.fit_transform(X_3_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [54]:
# # from keras.utils import to_categorical
# model = Sequential()
# inputs = X_2_scaled.shape[1]
# hiddens = inputs
# model.add(Dense(hiddens, input_dim=inputs, activation='relu'))
# model.add(Dense())
# adam=Adam()

Instructions for updating:
Colocations handled automatically by placer.


In [55]:
# y_2_train = to_categorical(y_2_train) 

# y_2_train

# y_3_train = to_categorical(y_3_train)

# y_3_train

# model.compile(loss='mean_squared_error', optimizer = 'adam', metrics=['acc'])

# history_y_2 = model.fit(X_2_train_scaled, y_2_train, epochs=5)

In [61]:
# X_2_predictions = model.predict_proba(X_2_train)

array([[2.5865453e+17]], dtype=float32)

## Trying XGBoost

In [76]:
# import xgboost as xgb
# np.random.seed(0)
# import matplotlib.pyplot as plt
# from sklearn.metrics import accuracy_score, f1_score
# from sklearn.model_selection import GridSearchCV
# %matplotlib inline

# # clf = xgb.XGBClassifier(objective = "multi:softmax" , 
# #                         num_class = 5, n_jobs=-1, n_estimators=50)
# # clf.fit(X_2_train, y_2_train)

# # X_2_train_preds = clf.predict(X_2_train)
# # X_2_preds = clf.predict(X_2)

# # X_2_training_accuracy = accuracy_score(y_2_train, X_2_train_preds)
# # X_2_training_f1 = f1_score(y_2_train, X_2_train_preds, average="weighted")
# # print("Training F1: {:.4}%".format(X_2_training_f1*100))
# # print("Training Accuracy: {:.4}%".format(X_2_training_accuracy * 100))

# clf_x_3 = xgb.XGBClassifier(objective = "multi:softmax" , max_depth=5, n_estimators=50, n_jobs=-1)
# clf_x_3.fit(X_3_train, y_3_train)

# X_3_train_preds = clf_x_3.predict(X_3_train)
# X_3_preds = clf_x_3.predict(X_3)
# X_3_training_accuracy = accuracy_score(y_3_train, X_3_train_preds)
# X_3_training_f1 = f1_score(y_3_train, X_3_train_preds, average="weighted")
# print("Training F1: {:.4}%".format(X_3_training_f1*100))
# print("Training Accuracy: {:.4}%".format(X_3_training_accuracy * 100))# 

Training F1: 27.47%
Training Accuracy: 37.28%


## GridSearch XGBoost

In [68]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     "learning_rate": [.1, .01] ,
#     'max_depth': [4, 5],
#     'min_child_weight': [1],
#     'n_estimators': [100]
# }
# grid_clf_x_2 = GridSearchCV(clf, param_grid, scoring='accuracy', 
#                         cv=None, n_jobs=-1 )
# grid_clf_x_2.fit(X_2_train, y_2_train)

In [69]:
# best_parameters = grid_clf_x_2.best_params_

# print("Grid Search found the following optimal parameters: ")
# for param_name in sorted(best_parameters.keys()):
#     print("%s: %r" % (param_name, best_parameters[param_name]))


In [70]:
# X_2_train_gs_preds = grid_clf_x_2.predict(X_2_train)
# X_2_gs_preds = grid_clf_x_2.predict(X_2)

In [71]:
# X_2_gstraining_accuracy = accuracy_score(y_2_train, X_2_train_gs_preds)
# X_2_gstraining_f1 = f1_score(y_2_train, X_2_train_gs_preds, average="weighted")
# print("Training F1: {:.4}%".format(X_2_gstraining_f1*100))
# print("Training Accuracy: {:.4}%".format(X_2_gstraining_accuracy * 100))

In [52]:
# from sklearn.model_selection import GridSearchCV
# param_grid_3 = {
#     "learning_rate": [.1, .01] ,
#     'max_depth': [4, 5],
#     'min_child_weight': [1],
#     'n_estimators': [100]
# }
# grid_clf_x_3 = GridSearchCV(clf_x_3, param_grid_3, scoring='accuracy', 
#                         cv=None, n_jobs=-1 )
# grid_clf_x_3.fit(X_3_train, y_3_train)
# best_parameters = grid_clf_x_3.best_params_

# print("Grid Search found the following optimal parameters: ")
# for param_name in sorted(best_parameters.keys()):
#     print("%s: %r" % (param_name, best_parameters[param_name]))
# X_3_train_gs_preds = grid_clf_x_3.predict(X_3_train)
# X_3_gs_preds = grid_clf_x_3.predict(X_3)
# X_3_gstraining_accuracy = accuracy_score(y_3_train, X_3_train_gs_preds)
# X_3_gstraining_f1 = f1_score(y_3_train, X_3_train_gs_preds, average="weighted")
# print("Training F1: {:.4}%".format(X_3_gstraining_f1*100))
# print("Training Accuracy: {:.4}%".format(X_3_gstraining_accuracy * 100))

In [53]:
from sklearn.ensemble import RandomForestClassifier 

In [54]:
rfc = RandomForestClassifier(n_estimators=10, n_jobs=-1)

In [56]:
rfc.fit(X_2_train, y_2_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
rfc3 = RandomForestClassifier(n_estimators=10, n_jobs=-1)
rfc3.fit(X_3_train, y_3_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [60]:
preds_X_2 = rfc.predict_proba(X_2_train)

In [61]:
preds_X_3 = rfc3.predict_proba(X_3_train)

In [68]:
requests_df = request_preds.toPandas()

In [69]:
for i, row in requests_df[requests_df['prediction'].isna()].iterrows():
    print (row)
    break

movie_id            148.0
rating                NaN
timestamp     977959026.0
user_id              53.0
prediction            NaN
Name: 0, dtype: float64


In [72]:
for i, row in requests_df.iterrows():
    if str(row['prediction']) == 'nan':
        if row['user_id'] in X_2['user_id']:
            requests_df.loc[i, 'prediction'] = rfc.predict(pd.DataFrame(X_2.loc[row['user_id'],:]).T)

KeyboardInterrupt: 

In [None]:
# for i, row in requests_df[requests_df['prediction'].isna()].iterrows():
#     if row['user_id'] in X_2['user_id']:
#         requests_df.loc[i, 'prediction'] = rfc.predict(pd.DataFrame(X_2.loc[row['user_id'],:]).T)

In [None]:
for i, row in requests_df.iterrows():
    if str(row['prediction']) == 'nan':
        if row['user_id'] in X_3['user_id']:
            requests_df.loc[i, 'prediction'] = rfc3.predict(pd.DataFrame(X_3.loc[row['user_id'],:]).T)

In [None]:
request_df.to_json('final_final_json.json')