# Logistic Regression Algorithm

## Standard imports and loading the training dataset

In [None]:
# Necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#Fitting models
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Load the training dataset (cleanData.ipynb is expected to define `df`)
%run cleanData.ipynb

In [None]:
# Checking that the dataset is ok
df

In [None]:
# Final cleaning pass: drop rows containing NaN or infinite values and cast everything to float64
def clean_dataset(df):
    """Return a float64 copy of *df* without rows containing NaN or +/-inf.

    The input frame is left untouched; the caller must use the returned
    frame (e.g. ``df = clean_dataset(df)``).

    Args:
        df: The ``pd.DataFrame`` to clean. Every remaining value must be
            convertible to ``np.float64``.

    Returns:
        A new ``pd.DataFrame`` that keeps the original index labels of the
        surviving rows, with every column cast to ``np.float64``.

    Raises:
        TypeError: If *df* is not a ``pd.DataFrame``.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df needs to be a pd.DataFrame")
    # Non-inplace dropna: do not mutate the caller's frame as a side effect.
    without_nan = df.dropna()
    # `axis` must be passed by keyword: pandas 2.x rejects the positional form.
    has_invalid = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_invalid].astype(np.float64)

# Rebind df to the cleaned, float64 result returned by clean_dataset
df = clean_dataset(df)

## Logistic Regression Model

In [None]:
#To see what the best parameters are
def best_model(model):
    """Print the best score, best parameters and best estimator of *model*.

    Args:
        model: An object exposing ``best_score_``, ``best_params_`` and
            ``best_estimator_`` attributes.

    Returns:
        None. The three values are written to stdout, one per line.
    """
    for attribute in ("best_score_", "best_params_", "best_estimator_"):
        print(getattr(model, attribute))

In [None]:
# Separate the target column (washDishes) from the features,
# then hold out 20% of the rows as a test set (fixed seed for reproducibility)
y = df['washDishes'].values
x = df.drop(columns=['washDishes'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [None]:
# Logistic Regression
accuracies = {}
# Hyper-parameter grid: only C, max_iter and tol take several values;
# the remaining entries are pinned to a single value.
param_grid = {
    'C': [0.1, 0.2, 0.5, 1, 10],
    'max_iter': [250, 500],
    'fit_intercept': [True],
    'intercept_scaling': [1],
    'penalty': ['l2'],
    'tol': [0.00001, 0.0001, 0.000001],
}
# GridSearchCV cross-validates every combination (5 folds) and, because
# refit=True, retrains the best one on the whole training set.
lr = GridSearchCV(LogisticRegression(solver='lbfgs'), param_grid, cv=5, refit=True, verbose=0)
lr.fit(x_train, y_train)
acc = lr.score(x_test, y_test) * 100
accuracies['Logistic Regression'] = acc
# best_model prints its own report and returns None, so wrapping it in
# print() would emit a stray "None" line.
best_model(lr)
print("Test Accuracy {:.2f}%".format(acc))

## Prediction

In [None]:
# Load the cleaned test set used for prediction (cleanData_test.ipynb is expected to define `df_test`)
%run cleanData_test.ipynb
df_test

In [None]:
# Read the raw test file only to recover the userId column
df3 = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_test.csv")
# lr is already fitted by the grid-search cell (refit=True keeps the best
# estimator), so predict directly instead of re-running the whole search.
y_pred = lr.predict(df_test)
# Pair each prediction with its userId
# NOTE(review): assumes df_test has the same rows, in the same order, as the raw test file - confirm
sol = pd.DataFrame({'userId': df3['userId'], 'washDishes': y_pred})
# Map numeric predictions back to labels: 1 -> 'Auto', any other value -> 'Hand'
sol["washDishes"] = sol["washDishes"].apply(lambda e: 'Auto' if e == 1 else 'Hand')
sol.to_csv('./DS_CentraleSupelec_ST42021/lrsolution.csv', index=False)
sol