<a href="https://colab.research.google.com/github/mdeihim/Music-Recommender/blob/main/Copy_of_Machine_Learning_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Models

## First, fetch the related scores from the training data

In [None]:
import pandas as pd
# Read the pipe-delimited training file. It has no header row, so the four
# column names are attached by hand right after the read.
train = pd.read_csv(
    'output.txt',
    header=None,
    sep='|',
)
train.columns = [
    'UserID',
    'ItemID',
    'Rating1',
    'Rating2',
]
train.head()

Unnamed: 0,UserID,ItemID,Rating1,Rating2
0,199810,208019,0.0,0.0
1,199810,74139,0.0,0.0
2,199810,9903,0.0,0.0
3,199810,242681,0.0,0.0
4,199810,18515,0.0,70.0


In [None]:
# Read the pipe-delimited test file (no header row) and name its columns.
test = pd.read_csv('test2_new.txt', header=None, sep='|')
test.columns = ['UserID', 'ItemID', 'Predictor']

# Add both rating columns, initialised to 0.0, then order the columns so the
# ratings sit between the identifiers and the Predictor column.
test = test.assign(Rating1=0.0, Rating2=0.0)
test = test[['UserID', 'ItemID', 'Rating1', 'Rating2', 'Predictor']]
test.head()

Unnamed: 0,UserID,ItemID,Rating1,Rating2,Predictor
0,200031,30877,0.0,0.0,1
1,200031,8244,0.0,0.0,1
2,200031,130183,0.0,0.0,0
3,200031,198762,0.0,0.0,0
4,200031,34503,0.0,0.0,1


In [None]:
# Find like values: copy Rating1/Rating2 from `train` into `test` for every
# (UserID, ItemID) pair that appears in both frames.
#
# One left merge replaces the original row-by-row loop, which
#   * wrote through chained indexing (test['Rating1'][i] = ...), so the write
#     could land on a temporary copy instead of `test`,
#   * assigned a whole filtered Series (possibly empty) to a single cell, and
#   * re-scanned all of `train` twice for every row of `test`.
key_cols = ['UserID', 'ItemID']
rating_cols = ['Rating1', 'Rating2']

# Keep one row per key so the merge can never duplicate rows of `test`.
train_ratings = train[key_cols + rating_cols].drop_duplicates(subset=key_cols)

test = (
    test.drop(columns=rating_cols)                     # discard the 0.0 placeholders
        .merge(train_ratings, on=key_cols, how='left')  # left merge keeps test's row order
        .fillna({col: 0.0 for col in rating_cols})      # missing ratings fall back to 0.0
        [key_cols + rating_cols + ['Predictor']]
)

In [None]:
# Write both frames to CSV without the index column; these files are read back
# through Spark later in the notebook.
for frame, csv_path in ((test, 'test.csv'), (train, 'train.csv')):
    frame.to_csv(csv_path, index=False)

## Second, install PySpark and convert the data to Spark DataFrames

In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 50.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=b3699d4283888a91d499590cc3dec6cc7a329babbd3655151241c345d923b1ff
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row


# Create the Spark entry points, or reuse them if this cell has already run.
# A second bare SparkContext() in the same kernel raises an error, whereas
# getOrCreate() hands back the live instance.
sc = SparkContext.getOrCreate()

# The original `if __name__ == "__main__":` guard is dropped: every later cell
# uses `spark` unconditionally, so it must always be defined here.
spark = (
    SparkSession.builder
    .appName("ML_Recommender")
    .getOrCreate()
)

In [None]:
# Load test.csv into a Spark DataFrame: first row is the header, and column
# types are inferred from the data.
csv_reader = spark.read
df = csv_reader.csv('test.csv', inferSchema=True, header=True)
df.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- ItemID: integer (nullable = true)
 |-- Rating1: double (nullable = true)
 |-- Rating2: double (nullable = true)
 |-- Predictor: integer (nullable = true)



In [None]:
# Preview the first five rows of the Spark frame.
df.show(n=5)

+------+------+-------+-------+---------+
|UserID|ItemID|Rating1|Rating2|Predictor|
+------+------+-------+-------+---------+
|200031| 30877|   90.0|   50.0|        1|
|200031|  8244|   90.0|    0.0|        1|
|200031|130183|    0.0|    0.0|        0|
|200031|198762|    0.0|    0.0|        0|
|200031| 34503|   90.0|   50.0|        1|
+------+------+-------+-------+---------+
only showing top 5 rows



In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Pack the two rating columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=['Rating1', 'Rating2'])

# Capture the original column order before 'features' is added.
cols = df.columns

# One-stage pipeline; the fitted result is kept in `model`.
pipeline = Pipeline(stages=[assembler])
model = pipeline.fit(df)

# Transform, then place 'features' first followed by the original columns.
selectedCols = ['features', *cols]
df = model.transform(df).select(selectedCols)


In [None]:
# 80/20 random split of `df` with a fixed seed, followed by the row count of
# each part.
split_weights = [0.8, 0.2]
train_1, test_1 = df.randomSplit(split_weights, seed=102898)
print(f'Training Length: {train_1.count()}')
print(f'Test Length: {test_1.count()}')

Training Length: 4780
Test Length: 1220


## Factorization Machine Classifier

In [None]:
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Build the Spark frame that the classifiers score: read train.csv and add the
# same 'features' vector column that `df` has.
#
# `assembler.transform` is called directly so this cell does not depend on what
# the generic global name `model` currently refers to. The assembler is the only
# stage of the feature pipeline, so the resulting columns are the same.
ml_df = spark.read.csv('train.csv', header=True, inferSchema=True)
ml_cols = ml_df.columns
selCols = ['features'] + ml_cols
ml_df = assembler.transform(ml_df).select(selCols)
ml_df.printSchema()

# Factorization machine classifier: fitted on `df` (the frame that carries the
# 'Predictor' label) and then applied to `ml_df`.
fac = FMClassifier(featuresCol='features', labelCol='Predictor', stepSize=0.005)
facmodel = fac.fit(df)
predictions_fac = facmodel.transform(ml_df)
predictions_fac.show(5)

root
 |-- features: vector (nullable = true)
 |-- UserID: integer (nullable = true)
 |-- ItemID: integer (nullable = true)
 |-- Rating1: double (nullable = true)
 |-- Rating2: double (nullable = true)

+----------+------+------+-------+-------+--------------------+--------------------+----------+
|  features|UserID|ItemID|Rating1|Rating2|       rawPrediction|         probability|prediction|
+----------+------+------+-------+-------+--------------------+--------------------+----------+
| (2,[],[])|199810|208019|    0.0|    0.0|[0.46254462957343...|[0.61361765912985...|       0.0|
| (2,[],[])|199810| 74139|    0.0|    0.0|[0.46254462957343...|[0.61361765912985...|       0.0|
| (2,[],[])|199810|  9903|    0.0|    0.0|[0.46254462957343...|[0.61361765912985...|       0.0|
| (2,[],[])|199810|242681|    0.0|    0.0|[0.46254462957343...|[0.61361765912985...|       0.0|
|[0.0,70.0]|199810| 18515|    0.0|   70.0|[-1.0439929863640...|[0.26038027962148...|       1.0|
+----------+------+------+-------+-------+--------------------+--------------------+----------+

In [None]:
# Bring the identifier and predicted-class columns into pandas, and add an
# empty TrackID column that the submission-formatting step fills in.
facResults = predictions_fac.select('UserID', 'ItemID', 'prediction').toPandas()
facResults['TrackID'] = ''
facResults.head()

Unnamed: 0,UserID,ItemID,prediction,TrackID
0,199810,208019,0.0,
1,199810,74139,0.0,
2,199810,9903,0.0,
3,199810,242681,0.0,
4,199810,18515,1.0,


In [None]:
# Format for submission: one row per prediction, keyed by "<UserID>_<ItemID>".
# The key is built column-wise. The original per-row chained assignment
# (facResults['TrackID'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into the frame.
facResults['TrackID'] = (
    facResults['UserID'].astype(str) + '_' + facResults['ItemID'].astype(str)
)

# Keep only the submission columns and rename the model output to 'Predictor'.
# Selecting and renaming returns a new frame, so no in-place drop on a slice.
facResults = facResults[['TrackID', 'prediction']].rename(columns={'prediction': 'Predictor'})
facResults.to_csv('factorization.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Baseline random forest with default hyperparameters (before any tuning):
# fit on the labelled frame `df`, then score `ml_df`.
rf = RandomForestClassifier(labelCol='Predictor', featuresCol='features')
rfModel = rf.fit(df)
predictions_rf = rfModel.transform(ml_df)
predictions_rf.show(n=5)

+----------+------+------+-------+-------+--------------------+--------------------+----------+
|  features|UserID|ItemID|Rating1|Rating2|       rawPrediction|         probability|prediction|
+----------+------+------+-------+-------+--------------------+--------------------+----------+
| (2,[],[])|199810|208019|    0.0|    0.0|[16.2222608701549...|[0.81111304350774...|       0.0|
| (2,[],[])|199810| 74139|    0.0|    0.0|[16.2222608701549...|[0.81111304350774...|       0.0|
| (2,[],[])|199810|  9903|    0.0|    0.0|[16.2222608701549...|[0.81111304350774...|       0.0|
| (2,[],[])|199810|242681|    0.0|    0.0|[16.2222608701549...|[0.81111304350774...|       0.0|
|[0.0,70.0]|199810| 18515|    0.0|   70.0|[5.65634778859979...|[0.28281738942998...|       1.0|
+----------+------+------+-------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Cross-validation to tune the random forest hyperparameters
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Grid search for best params: every combination of tree depth and forest size.
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3,4,5,6,7])
    .addGrid(rf.numTrees, [5,8,10,12,15])
    .build()
)

# Evaluation metric: BinaryClassificationEvaluator's default, area under the ROC
# curve (not RMSE). The hard 0/1 'prediction' column is used as the score.
# NOTE(review): consider scoring on 'rawPrediction' instead -- confirm intent.
evaluator = BinaryClassificationEvaluator(labelCol='Predictor',rawPredictionCol="prediction")
print ("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

# Fit the cross validator on the labelled frame `df`. The result gets its own
# name so that the name `model`, which the feature-assembly cell binds to the
# fitted pipeline, is left untouched.
rf_cv_model = cv.fit(df)

# Extract the best model and score the same frame it was tuned on.
best_model = rf_cv_model.bestModel
train_output = best_model.transform(df)
# Variable name kept for compatibility; the value is an area under ROC.
train_accuracy = evaluator.evaluate(train_output)

# Print the best parameters and the metric
print("**Best Model from Training**")
print("  MaxDepth:", best_model._java_obj.parent().getMaxDepth())
print("  NumTrees:", best_model._java_obj.parent().getNumTrees())
print("  Area under ROC:", train_accuracy)

Num models to be tested:  25
**Best Model from Training**
  MaxDepth: 3
  NumTrees: 12
  Accuracy: 0.8543333333333333


In [None]:
# Score `ml_df` with the current `best_model` (set by the tuning cell) and
# preview the first five predictions.
predictions_rf = best_model.transform(ml_df)
predictions_rf.show(n=5)

+----------+------+------+-------+-------+--------------------+--------------------+----------+
|  features|UserID|ItemID|Rating1|Rating2|       rawPrediction|         probability|prediction|
+----------+------+------+-------+-------+--------------------+--------------------+----------+
| (2,[],[])|199810|208019|    0.0|    0.0|[9.70530455772366...|[0.80877537981030...|       0.0|
| (2,[],[])|199810| 74139|    0.0|    0.0|[9.70530455772366...|[0.80877537981030...|       0.0|
| (2,[],[])|199810|  9903|    0.0|    0.0|[9.70530455772366...|[0.80877537981030...|       0.0|
| (2,[],[])|199810|242681|    0.0|    0.0|[9.70530455772366...|[0.80877537981030...|       0.0|
|[0.0,70.0]|199810| 18515|    0.0|   70.0|[1.06470954036445...|[0.08872579503037...|       1.0|
+----------+------+------+-------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:

# Bring the identifier and predicted-class columns into pandas, and add an
# empty TrackID column that the submission-formatting step fills in.
rfResults = predictions_rf.select('UserID', 'ItemID', 'prediction').toPandas()
rfResults['TrackID'] = ''
rfResults.head()

Unnamed: 0,UserID,ItemID,prediction,TrackID
0,199810,208019,0.0,
1,199810,74139,0.0,
2,199810,9903,0.0,
3,199810,242681,0.0,
4,199810,18515,1.0,


In [None]:
# Format for submission: one row per prediction, keyed by "<UserID>_<ItemID>".
# The key is built column-wise. The original per-row chained assignment
# (rfResults['TrackID'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into the frame.
rfResults['TrackID'] = (
    rfResults['UserID'].astype(str) + '_' + rfResults['ItemID'].astype(str)
)

# Keep only the submission columns and rename the model output to 'Predictor'.
# Selecting and renaming returns a new frame, so no in-place drop on a slice.
rfResults = rfResults[['TrackID', 'prediction']].rename(columns={'prediction': 'Predictor'})
rfResults.to_csv('RandomForest.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Decision Tree

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Baseline decision tree limited to depth 3: fit on the labelled frame `df`,
# then score `ml_df`.
dt = DecisionTreeClassifier(maxDepth=3, labelCol='Predictor', featuresCol='features')
dtModel = dt.fit(df)
predictions_dt = dtModel.transform(ml_df)
predictions_dt.show(n=5)

+----------+------+------+-------+-------+--------------+--------------------+----------+
|  features|UserID|ItemID|Rating1|Rating2| rawPrediction|         probability|prediction|
+----------+------+------+-------+-------+--------------+--------------------+----------+
| (2,[],[])|199810|208019|    0.0|    0.0|[2775.0,649.0]|[0.81045560747663...|       0.0|
| (2,[],[])|199810| 74139|    0.0|    0.0|[2775.0,649.0]|[0.81045560747663...|       0.0|
| (2,[],[])|199810|  9903|    0.0|    0.0|[2775.0,649.0]|[0.81045560747663...|       0.0|
| (2,[],[])|199810|242681|    0.0|    0.0|[2775.0,649.0]|[0.81045560747663...|       0.0|
|[0.0,70.0]|199810| 18515|    0.0|   70.0|[224.0,2252.0]|[0.09046849757673...|       1.0|
+----------+------+------+-------+-------+--------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Grid search for best params: every combination of tree depth and bin count.
param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [2,3,4,5])
    .addGrid(dt.maxBins, [5,6,7,8,10,12])
    .build()
)

# Evaluation metric: BinaryClassificationEvaluator's default, area under the ROC
# curve. The hard 0/1 'prediction' column is used as the score.
# NOTE(review): consider scoring on 'rawPrediction' instead -- confirm intent.
evaluator = BinaryClassificationEvaluator(labelCol='Predictor',rawPredictionCol="prediction")
print ("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=dt, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

# Fit the cross validator on the labelled frame `df`. The result gets its own
# name so that the name `model`, which the feature-assembly cell binds to the
# fitted pipeline, is left untouched.
dt_cv_model = cv.fit(df)

# Extract the best model and score the same frame it was tuned on.
best_model = dt_cv_model.bestModel
train_output = best_model.transform(df)
# Variable name kept for compatibility; the value is an area under ROC.
train_accuracy = evaluator.evaluate(train_output)

# Print the best parameters and the metric
print("**Best Model from Training**")
print("  MaxDepth:", best_model._java_obj.parent().getMaxDepth())
print("  MaxBins:", best_model._java_obj.parent().getMaxBins())
print("  Area under ROC:", train_accuracy)

Num models to be tested:  24
**Best Model from Training**
  MaxDepth: 2
  MaxBins: 5
  Accuracy: 0.854


In [None]:
# Score `ml_df` with the current `best_model` (set by the tuning cell) and
# preview the first five predictions.
predictions_dt = best_model.transform(ml_df)
predictions_dt.show(n=5)

+----------+------+------+-------+-------+--------------+--------------------+----------+
|  features|UserID|ItemID|Rating1|Rating2| rawPrediction|         probability|prediction|
+----------+------+------+-------+-------+--------------+--------------------+----------+
| (2,[],[])|199810|208019|    0.0|    0.0|[2771.0,647.0]|[0.81070801638385...|       0.0|
| (2,[],[])|199810| 74139|    0.0|    0.0|[2771.0,647.0]|[0.81070801638385...|       0.0|
| (2,[],[])|199810|  9903|    0.0|    0.0|[2771.0,647.0]|[0.81070801638385...|       0.0|
| (2,[],[])|199810|242681|    0.0|    0.0|[2771.0,647.0]|[0.81070801638385...|       0.0|
|[0.0,70.0]|199810| 18515|    0.0|   70.0|[229.0,2256.0]|[0.09215291750503...|       1.0|
+----------+------+------+-------+-------+--------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Bring the identifier and predicted-class columns into pandas, and add an
# empty TrackID column that the submission-formatting step fills in.
dtResults = predictions_dt.select('UserID', 'ItemID', 'prediction').toPandas()
dtResults['TrackID'] = ''
dtResults.head()

Unnamed: 0,UserID,ItemID,prediction,TrackID
0,199810,208019,0.0,
1,199810,74139,0.0,
2,199810,9903,0.0,
3,199810,242681,0.0,
4,199810,18515,1.0,


In [None]:
# Format for submission: one row per prediction, keyed by "<UserID>_<ItemID>".
# The key is built column-wise. The original per-row chained assignment
# (dtResults['TrackID'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into the frame.
dtResults['TrackID'] = (
    dtResults['UserID'].astype(str) + '_' + dtResults['ItemID'].astype(str)
)

# Keep only the submission columns and rename the model output to 'Predictor'.
# Selecting and renaming returns a new frame, so no in-place drop on a slice.
dtResults = dtResults[['TrackID', 'prediction']].rename(columns={'prediction': 'Predictor'})
dtResults.to_csv('decisionTree.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## SVM Classifier

In [None]:
from pyspark.ml.classification import LinearSVC

# Baseline linear SVM (10 iterations, regularisation 0.1). It is fitted on the
# labelled frame `df` and scored on that same frame.
lsvc = LinearSVC(regParam=0.1, maxIter=10, labelCol='Predictor')
lsvcModel = lsvc.fit(df)
predictions_lsvc = lsvcModel.transform(df)
predictions_lsvc.show(n=5)

+-----------+------+------+-------+-------+---------+--------------------+----------+
|   features|UserID|ItemID|Rating1|Rating2|Predictor|       rawPrediction|prediction|
+-----------+------+------+-------+-------+---------+--------------------+----------+
|[90.0,50.0]|200031| 30877|   90.0|   50.0|        1|[-1.1301141884347...|       1.0|
| [90.0,0.0]|200031|  8244|   90.0|    0.0|        1|[-0.1599416934524...|       1.0|
|  (2,[],[])|200031|130183|    0.0|    0.0|        0|[1.00008112737597...|       0.0|
|  (2,[],[])|200031|198762|    0.0|    0.0|        0|[1.00008112737597...|       0.0|
|[90.0,50.0]|200031| 34503|   90.0|   50.0|        1|[-1.1301141884347...|       1.0|
+-----------+------+------+-------+-------+---------+--------------------+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Grid search for best params: every combination of regularisation strength and
# iteration cap.
param_grid = (
    ParamGridBuilder()
    .addGrid(lsvc.regParam, [.01,.05,.1,.15,.2])
    .addGrid(lsvc.maxIter, [2,5,7,10])
    .build()
)

# Evaluation metric: BinaryClassificationEvaluator's default, area under the ROC
# curve (not RMSE). The hard 0/1 'prediction' column is used as the score.
# NOTE(review): consider scoring on 'rawPrediction' instead -- confirm intent.
evaluator = BinaryClassificationEvaluator(labelCol='Predictor',rawPredictionCol="prediction")
print ("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=lsvc, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

# Fit the cross validator on the labelled frame `df`. The result gets its own
# name so that the name `model`, which the feature-assembly cell binds to the
# fitted pipeline, is left untouched.
lsvc_cv_model = cv.fit(df)

# Extract the best model and score the same frame it was tuned on.
best_model = lsvc_cv_model.bestModel
train_output = best_model.transform(df)
# Variable name kept for compatibility; the value is an area under ROC.
train_accuracy = evaluator.evaluate(train_output)

# Print the best parameters and the metric
print("**Best Model from Training**")
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
print("  RegParam:", best_model._java_obj.parent().getRegParam())
print("  Area under ROC:", train_accuracy)

Num models to be tested:  20
**Best Model from Training**
  MaxIter: 7
  RegParam: 0.05
  Accuracy: 0.8530000000000001


In [None]:
# Apply the current `best_model` (set by the tuning cell) to `ml_df` and preview
# the first five predictions.
predictions_svm = best_model.transform(ml_df)
predictions_svm.show(n=5)

+----------+------+------+-------+-------+--------------------+----------+
|  features|UserID|ItemID|Rating1|Rating2|       rawPrediction|prediction|
+----------+------+------+-------+-------+--------------------+----------+
| (2,[],[])|199810|208019|    0.0|    0.0|[0.97015197750719...|       0.0|
| (2,[],[])|199810| 74139|    0.0|    0.0|[0.97015197750719...|       0.0|
| (2,[],[])|199810|  9903|    0.0|    0.0|[0.97015197750719...|       0.0|
| (2,[],[])|199810|242681|    0.0|    0.0|[0.97015197750719...|       0.0|
|[0.0,70.0]|199810| 18515|    0.0|   70.0|[-0.5692418334227...|       1.0|
+----------+------+------+-------+-------+--------------------+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under ROC (the evaluator's default metric) for `predictions_lsvc`, i.e.
# the untuned LinearSVC scored on `df`, the same frame it was fitted on.
evaluator = BinaryClassificationEvaluator(labelCol='Predictor')
lsvc_auc = evaluator.evaluate(predictions_lsvc)
print("Test Area Under ROC: " + str(lsvc_auc))

Test Area Under ROC: 0.8726736111111111


In [None]:
# Bring the identifier and predicted-class columns into pandas, and add an
# empty TrackID column that the submission-formatting step fills in.
lsvcResults = predictions_svm.select('UserID', 'ItemID', 'prediction').toPandas()
lsvcResults['TrackID'] = ''
lsvcResults.head()

Unnamed: 0,UserID,ItemID,prediction,TrackID
0,199810,208019,0.0,
1,199810,74139,0.0,
2,199810,9903,0.0,
3,199810,242681,0.0,
4,199810,18515,1.0,


In [None]:
# Format for submission: one row per prediction, keyed by "<UserID>_<ItemID>".
# The key is built column-wise. The original per-row chained assignment
# (lsvcResults['TrackID'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into the frame.
lsvcResults['TrackID'] = (
    lsvcResults['UserID'].astype(str) + '_' + lsvcResults['ItemID'].astype(str)
)

# Keep only the submission columns and rename the model output to 'Predictor'.
# Selecting and renaming returns a new frame, so no in-place drop on a slice.
lsvcResults = lsvcResults[['TrackID', 'prediction']].rename(columns={'prediction': 'Predictor'})
lsvcResults.to_csv('lsvc.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

# Baseline logistic regression capped at 10 iterations. It is fitted on the
# labelled frame `df` and scored on that same frame.
lr = LogisticRegression(maxIter=10, labelCol='Predictor', featuresCol='features')
lrmodel = lr.fit(df)
predictions_lr = lrmodel.transform(df)
predictions_lr.show(n=5)

+-----------+------+------+-------+-------+---------+--------------------+--------------------+----------+
|   features|UserID|ItemID|Rating1|Rating2|Predictor|       rawPrediction|         probability|prediction|
+-----------+------+------+-------+-------+---------+--------------------+--------------------+----------+
|[90.0,50.0]|200031| 30877|   90.0|   50.0|        1|[-4.6445020897207...|[0.00952276070842...|       1.0|
| [90.0,0.0]|200031|  8244|   90.0|    0.0|        1|[-3.0228713032312...|[0.04640325376362...|       1.0|
|  (2,[],[])|200031|130183|    0.0|    0.0|        0|[1.42417015136285...|[0.80599133227115...|       0.0|
|  (2,[],[])|200031|198762|    0.0|    0.0|        0|[1.42417015136285...|[0.80599133227115...|       0.0|
|[90.0,50.0]|200031| 34503|   90.0|   50.0|        1|[-4.6445020897207...|[0.00952276070842...|       1.0|
+-----------+------+------+-------+-------+---------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Area under ROC (the evaluator's default metric) for `predictions_lr`, i.e.
# the untuned logistic regression scored on `df`, the frame it was fitted on.
evaluator = BinaryClassificationEvaluator(labelCol='Predictor')
lr_auc = evaluator.evaluate(predictions_lr)
print("Test Area Under ROC: " + str(lr_auc))

Test Area Under ROC: 0.8749316111111111


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Hyperparameter grid for logistic regression, built one axis at a time:
# regularisation strength x elastic-net mixing x iteration cap.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.01, 0.1, 0.3, 0.5])
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.3, 0.5, 1.0])
grid_builder = grid_builder.addGrid(lr.maxIter, [1, 5, 10, 15])
paramGrid = grid_builder.build()

# 5-fold cross-validation, scored with the `evaluator` already in scope.
cv = CrossValidator(
    numFolds=5,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    estimator=lr,
)

# Tune on `df`, then score that same frame with the selected model.
cvModel = cv.fit(df)
predictions_lrTuned = cvModel.transform(df)
lr_tuned_auc = evaluator.evaluate(predictions_lrTuned)
print('Test Area Under ROC', lr_tuned_auc)

Test Area Under ROC 0.8747172777777777


In [None]:
# Score `ml_df` with the cross-validated logistic regression, bring the
# identifier and predicted-class columns into pandas, and add an empty TrackID
# column that the submission-formatting step fills in.
predictions_lrTuned = cvModel.transform(ml_df)
lrResults = predictions_lrTuned.select('UserID', 'ItemID', 'prediction').toPandas()
lrResults['TrackID'] = ''
lrResults.head()

Unnamed: 0,UserID,ItemID,prediction,TrackID
0,199810,208019,0.0,
1,199810,74139,0.0,
2,199810,9903,0.0,
3,199810,242681,0.0,
4,199810,18515,1.0,


In [None]:
# Format for submission: one row per prediction, keyed by "<UserID>_<ItemID>".
# The key is built column-wise. The original per-row chained assignment
# (lrResults['TrackID'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into the frame.
lrResults['TrackID'] = (
    lrResults['UserID'].astype(str) + '_' + lrResults['ItemID'].astype(str)
)

# Keep only the submission columns and rename the model output to 'Predictor'.
# Selecting and renaming returns a new frame, so no in-place drop on a slice.
lrResults = lrResults[['TrackID', 'prediction']].rename(columns={'prediction': 'Predictor'})
lrResults.to_csv('logisticRegression.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Gradient Boosted Tree Classifier

In [None]:
from pyspark.ml.classification import GBTClassifier

# Baseline gradient-boosted trees capped at 10 iterations. It is fitted on the
# labelled frame `df` and scored on that same frame.
gbt = GBTClassifier(maxIter=10, labelCol='Predictor', featuresCol='features')
gbtmodel = gbt.fit(df)
predictions_gbt = gbtmodel.transform(df)
predictions_gbt.show(n=5)

+-----------+------+------+-------+-------+---------+--------------------+--------------------+----------+
|   features|UserID|ItemID|Rating1|Rating2|Predictor|       rawPrediction|         probability|prediction|
+-----------+------+------+-------+-------+---------+--------------------+--------------------+----------+
|[90.0,50.0]|200031| 30877|   90.0|   50.0|        1|[-1.3258100058733...|[0.06588923111810...|       1.0|
| [90.0,0.0]|200031|  8244|   90.0|    0.0|        1|[-1.3258100058733...|[0.06588923111810...|       1.0|
|  (2,[],[])|200031|130183|    0.0|    0.0|        0|[0.69621496476798...|[0.80097988422856...|       0.0|
|  (2,[],[])|200031|198762|    0.0|    0.0|        0|[0.69621496476798...|[0.80097988422856...|       0.0|
|[90.0,50.0]|200031| 34503|   90.0|   50.0|        1|[-1.3258100058733...|[0.06588923111810...|       1.0|
+-----------+------+------+-------+-------+---------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Grid search for best params: every combination of tree depth, boosting
# iterations and bin count.
param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3,5,7,9])
    .addGrid(gbt.maxIter, [2,5,7,10])
    .addGrid(gbt.maxBins, [4,6,8,10])
    .build()
)

# Evaluation metric: BinaryClassificationEvaluator's default, area under the ROC
# curve (not RMSE). The hard 0/1 'prediction' column is used as the score.
# NOTE(review): consider scoring on 'rawPrediction' instead -- confirm intent.
evaluator = BinaryClassificationEvaluator(labelCol='Predictor',rawPredictionCol="prediction")
print ("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=gbt, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)

# Fit the cross validator on the labelled frame `df`. The result gets its own
# name so that the name `model`, which the feature-assembly cell binds to the
# fitted pipeline, is left untouched.
gbt_cv_model = cv.fit(df)

# Extract the best model and score the same frame it was tuned on.
best_model = gbt_cv_model.bestModel
train_output = best_model.transform(df)
# Variable name kept for compatibility; the value is an area under ROC.
train_accuracy = evaluator.evaluate(train_output)

# Print the best parameters and the metric
print("**Best Model from Training**")
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
print("  MaxBins:", best_model._java_obj.parent().getMaxBins())
print("  MaxDepth:", best_model._java_obj.parent().getMaxDepth())
print("  Area under ROC:", train_accuracy)

Num models to be tested:  64
**Best Model from Training**
  MaxIter: 2
  MaxBins: 4
  MaxDepth: 3
  Accuracy: 0.854


In [None]:
# Score `ml_df` with the current `best_model` (set by the tuning cell), bring
# the identifier and predicted-class columns into pandas, and add an empty
# TrackID column that the submission-formatting step fills in.
predictions_gbt = best_model.transform(ml_df)
gbtResults = predictions_gbt.select('UserID', 'ItemID', 'prediction').toPandas()
gbtResults['TrackID'] = ''
gbtResults.head()

Unnamed: 0,UserID,ItemID,prediction,TrackID
0,199810,208019,0.0,
1,199810,74139,0.0,
2,199810,9903,0.0,
3,199810,242681,0.0,
4,199810,18515,1.0,


In [None]:
# Area under ROC has to be measured against the true label. The original cell
# passed labelCol='prediction' on the `ml_df` predictions, so the model was
# scored against its own output and the result was a meaningless 1.0.
# `ml_df` carries no 'Predictor' column, so the tuned model (`best_model`) is
# evaluated on the labelled frame `df` instead.
evaluator = BinaryClassificationEvaluator(labelCol = 'Predictor')
print("Test Area Under ROC: " + str(evaluator.evaluate(best_model.transform(df))))

Test Area Under ROC: 1.0


In [None]:
# Format for submission: one row per prediction, keyed by "<UserID>_<ItemID>".
# The key is built column-wise. The original per-row chained assignment
# (gbtResults['TrackID'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into the frame.
gbtResults['TrackID'] = (
    gbtResults['UserID'].astype(str) + '_' + gbtResults['ItemID'].astype(str)
)

# Keep only the submission columns and rename the model output to 'Predictor'.
# Selecting and renaming returns a new frame, so no in-place drop on a slice.
gbtResults = gbtResults[['TrackID', 'prediction']].rename(columns={'prediction': 'Predictor'})
gbtResults.to_csv('gradient_boosted_tree.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
