In [159]:
# Load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score

%matplotlib inline

## Import Data 

In [160]:
# Create URL
test_csv_2 = "C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_test.csv"
train_csv_2 = "C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_train.csv"

# Load Dataset 
test_df = pd.read_csv(test_csv_2)
train_df = pd.read_csv(train_csv_2)

In [161]:
X_train = train_df[['n_steps', 'n_ingredients']]
y = train_df['duration_label']

In [162]:
import scipy
X_train_name = scipy.sparse.load_npz('C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\train_name_vec.npz')
X_test_name = scipy.sparse.load_npz('C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\test_name_vec.npz')

X_train_step = scipy.sparse.load_npz('C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\train_steps_vec.npz')
X_test_step = scipy.sparse.load_npz('C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\test_steps_vec.npz')

X_train_ingr = scipy.sparse.load_npz('C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\train_ingr_vec.npz')
X_test_ingr = scipy.sparse.load_npz('C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_countvec\\test_ingr_vec.npz')

### Select Features with CountVec and Mutual Information

In [128]:
# Feature Selection with Mutual Information for 'name'
k = 1000
mi = SelectKBest(score_func=mutual_info_classif, k=k)
name_train_mi = mi.fit_transform(X_train_name,y)
name_test_mi = mi.transform(X_test_name)

In [61]:
# Feature Selection with Mutual Information for 'steps'
mi = SelectKBest(score_func=mutual_info_classif, k=k)
step_train_mi = mi.fit_transform(X_train_step,y)
step_test_mi = mi.transform(X_test_step)

In [62]:
# Feature Selection with Mutual Information for 'steps'
mi = SelectKBest(score_func=mutual_info_classif, k=k)
ingr_train_mi = mi.fit_transform(X_train_ingr,y)
ingr_test_mi = mi.transform(X_test_ingr)

### Merging Features for Training Data

In [163]:
new_train_df = train_df.copy()
new_train_df = new_train_df[['n_steps','n_ingredients']]

In [164]:
name_matrix = name_train_mi.todense()
name_list = name_matrix.tolist()
name_df = pd.DataFrame(name_list)
name_df = name_df.add_prefix('name_')

step_matrix = step_train_mi.todense()
step_list = step_matrix.tolist()
step_df = pd.DataFrame(step_list)
step_df = step_df.add_prefix('step_')

ingr_matrix = ingr_train_mi.todense()
ingr_list = ingr_matrix.tolist()
ingr_df = pd.DataFrame(ingr_list)
ingr_df = ingr_df.add_prefix('ingr_')

In [165]:
features_train = new_train_df.join(name_df)
features_train = features_train.join(step_df)
features_train = features_train.join(ingr_df)

### Test data

In [166]:
new_test_df = test_df.copy()
new_test_df = new_test_df[['n_steps','n_ingredients']]

name_matrix_test = name_test_mi.todense()
name_list_test = name_matrix_test.tolist()
name_df_test = pd.DataFrame(name_list_test)
name_df_test = name_df_test.add_prefix('name_')

step_matrix_test = step_test_mi.todense()
step_list_test = step_matrix_test.tolist()
step_df_test = pd.DataFrame(step_list_test)
step_df_test = step_df_test.add_prefix('step_')

ingr_matrix_test = ingr_test_mi.todense()
ingr_list_test = ingr_matrix_test.tolist()
ingr_df_test = pd.DataFrame(ingr_list_test)
ingr_df_test = ingr_df_test.add_prefix('ingr_')

features_test = new_test_df.join(name_df_test)
features_test = features_test.join(step_df_test)
features_test = features_test.join(ingr_df_test)

## Doc2Vec100

In [67]:
# Load Doc2Vec Dataset 
train_name_doc100 = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\recipe_text_features_doc2vec100\\train_name_doc2vec100.csv", header=None)
test_name_doc100 = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\recipe_text_features_doc2vec100\\test_name_doc2vec100.csv", header=None)
train_steps_doc100 = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\recipe_text_features_doc2vec100\\train_steps_doc2vec100.csv", header=None)
test_steps_doc100 = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\recipe_text_features_doc2vec100\\test_steps_doc2vec100.csv", header=None)
train_ingr_doc100 = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\recipe_text_features_doc2vec100\\train_ingr_doc2vec100.csv", header=None)
test_ingr_doc100 = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec100\\recipe_text_features_doc2vec100\\test_ingr_doc2vec100.csv", header=None)

In [68]:
train_name_doc100 = train_name_doc100.add_suffix('name_doc2vec')
feature_train_doc100 = features_train.join(train_name_doc100)
train_steps_doc100 = train_steps_doc100.add_suffix('step_doc2vec')
feature_train_doc100 = feature_train_doc100.join(train_steps_doc100)
train_ingr_doc100 = train_ingr_doc100.add_suffix('ingr_doc2vec')
feature_train_doc100 = feature_train_doc100.join(train_ingr_doc100)

In [69]:
test_name_doc100 = test_name_doc100.add_suffix('name_doc2vec')
feature_test_doc100 = features_test.join(test_name_doc100)
test_steps_doc100 = test_steps_doc100.add_suffix('step_doc2vec')
feature_test_doc100 = feature_test_doc100.join(test_steps_doc100)
test_ingr_doc100 = test_ingr_doc100.add_suffix('ingr_doc2vec')
feature_test_doc100 = feature_test_doc100.join(test_ingr_doc100)

## Doc2Vec50

In [70]:
train_name_tot = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\recipe_text_features_doc2vec50\\train_name_doc2vec50.csv", header=None)
test_name_tot = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\recipe_text_features_doc2vec50\\test_name_doc2vec50.csv", header=None)
train_steps_tot = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\recipe_text_features_doc2vec50\\train_steps_doc2vec50.csv", header=None)
test_steps_tot = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\recipe_text_features_doc2vec50\\test_steps_doc2vec50.csv", header=None)
train_ingr_tot = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\recipe_text_features_doc2vec50\\train_ingr_doc2vec50.csv", header=None)
test_ingr_tot = pd.read_csv("C:\\Users\\kenne\\recipe-cooktime-predictor data\\COMP30027_2021_Project2_datasets\\recipe_text_features_doc2vec50\\recipe_text_features_doc2vec50\\test_ingr_doc2vec50.csv", header=None)

In [71]:
train_name_tot = train_name_tot.add_suffix('name_doc2vec_50')
feature_train_tot = features_train.join(train_name_tot)
train_steps_tot = train_steps_tot.add_suffix('step_doc2vec_50')
feature_train_tot = feature_train_tot.join(train_steps_tot)
train_ingr_tot = train_ingr_tot.add_suffix('ingr_doc2vec_50')
feature_train_tot = feature_train_tot.join(train_ingr_tot)

In [72]:
test_name_tot = test_name_tot.add_suffix('name_doc2vec_50')
feature_test_tot = features_test.join(test_name_tot)
test_steps_tot = test_steps_tot.add_suffix('step_doc2vec_50')
feature_test_tot = feature_test_tot.join(test_steps_tot)
test_ingr_tot = test_ingr_tot.add_suffix('ingr_doc2vec_50')
feature_test_tot = feature_test_tot.join(test_ingr_tot)

In [73]:
# Split the training set into train and test set
X_train_tot_split, X_test_tot_split, y_train_tot_split, y_test_tot_split = train_test_split(feature_train_tot, y, test_size=0.33, random_state=88)

# Model using Logistic Regression

In [74]:
lgr = LogisticRegression(C=0.01, solver='sag')
lgr.fit(feature_train_tot, y)
ybar = lgr.predict(feature_test_tot)



In [75]:
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('predict.csv', index=False)
acc = lgr.score(X_test_tot_split, y_test_tot_split)
print("Accuracy:",acc)
y_pred = lgr.predict(X_test_tot_split)
print("f1 score:", f1_score(y_test_tot_split, y_pred, average='micro'))

Accuracy: 0.8253787878787879
f1 score: 0.8253787878787879


# ----------------------------------------------------------------------------------------------------------

## Selecting Features with Chi-Square

In [85]:
from sklearn.feature_selection import SelectKBest, chi2

k=1000
x2 = SelectKBest(chi2, k=k)
x2.fit(name_df,y)
X_train_x2 = x2.transform(name_df)
X_test_x2 = x2.transform(name_df_test)

In [86]:
lgr = LogisticRegression()
lgr.fit(X_train_x2, y)
acc = lgr.score(X_train_x2, y)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.721175


## Selecting Features with Mutual Information

In [87]:
k=1000
mi = SelectKBest(score_func=mutual_info_classif, k=k)
mi.fit(name_df,y)
X_train_mi = mi.transform(name_df)
X_test_mi = mi.transform(name_df_test)

In [88]:
lgr = LogisticRegression()
lgr.fit(X_train_mi, y)
acc = lgr.score(X_train_mi, y)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.721175


## Evaluation - Holdout or Cross Validation

### Holdout Strategy

In [90]:
lgr = LogisticRegression()
lgr.fit(X_train_tot_split, y_train_tot_split)
acc = lgr.score(X_train_tot_split, y_train_tot_split)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8372014925373135


### Cross-Validation Strategy

In [91]:
lgr = LogisticRegression()
lgr.fit(feature_train_tot, y)
acc = np.mean(cross_val_score(lgr, feature_train_tot, y, cv=5))
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "c:\users\kenne\python\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\kenne\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1344, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
  File "c:\users\kenne\python\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\users\kenne\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*

Accuracy: nan


Traceback (most recent call last):
  File "c:\users\kenne\python\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\kenne\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1344, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
  File "c:\users\kenne\python\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\users\kenne\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "c:\users\kenne\python\lib\site-packages\sklearn\utils\validation.py", line 814, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "c:\users\kenne\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "c:\users\kenne\python\lib\site-packages\sklearn\utils\validation.py", 

In [93]:
print(acc)

0.36984848484848487


#### Both Holdout and Cross-validation get similar accuracy score. However, holdout is significantly faster than cross validation and for this reason, holdout strategy will be chosen 

## Different Models

In [105]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
models = [GaussianNB(),
          KNeighborsClassifier(n_neighbors=5),
          DecisionTreeClassifier(max_depth=None),
          DecisionTreeClassifier(max_depth=1)]
titles = ['GNB',
          '5-nearest neighbour',
          'Decision Tree',
          'One_R']

for title, model in zip(titles, models):
    model.fit(X_train_tot_split,y_train_tot_split)
    acc = model.score(X_test_tot_split,y_test_tot_split)
    print(title, "Accuracy:",acc)

GNB Accuracy: 0.36984848484848487
5-nearest neighbour Accuracy: 0.7217424242424243
Decision Tree Accuracy: 0.7106818181818182
One_R Accuracy: 0.6552272727272728


In [109]:
log = LogisticRegression()
log.fit(X_train_tot_split,y_train_tot_split)
acc = log.score(X_test_tot_split,y_test_tot_split)
print("LogisticRegression Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression Accuracy: 0.8018181818181818


## Stacking, Ensemble Model

In [110]:
from sklearn.metrics import accuracy_score

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    


classifiers = [DecisionTreeClassifier(), KNeighborsClassifier(), MultinomialNB()]
titles = ['Decision Tree', 'KNeighborsClassifier()', 'Multinomial NB']



meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)

In [144]:
# Split the training set into train and test set
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y, test_size=0.33, random_state=88)

stacker_lr.fit(X_train_split, y_train_split)
acc = stacker_lr.score(X_test_split, y_test_split)
print("Stacker Accuracy:",acc)

Stacker Accuracy: 0.6261363636363636


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Model Tuning

First, binary logistic regression requires the dependent variable to be binary and ordinal logistic regression requires the dependent variable to be ordinal.

Second, logistic regression requires the observations to be independent of each other.  In other words, the observations should not come from repeated measurements or matched data.

Third, logistic regression requires there to be little or no multicollinearity among the independent variables.  This means that the independent variables should not be too highly correlated with each other.

Fourth, logistic regression assumes linearity of independent variables and log odds.  although this analysis does not require the dependent and independent variables to be related linearly, it requires that the independent variables are linearly related to the log odds.

Finally, logistic regression typically requires a large sample size.  A general guideline is that you need at minimum of 10 cases with the least frequent outcome for each independent variable in your model. For example, if you have 5 independent variables and the expected probability of your least frequent outcome is .10, then you would need a minimum sample size of 500 (10*5 / .10).

In [145]:
# Logistic Regression 
from sklearn.linear_model import LogisticRegression
lg2 = LogisticRegression(class_weight='balanced') # inverse class weighting to reduce bias
lg2.fit(X_train_tot_split,y_train_tot_split)

acc = lg2.score(X_test_tot_split, y_test_tot_split)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7859090909090909


## Scaling

In [149]:
# Normalizing features - binary logistic regression requires the dependent variable to be binary and ordinal logistic regression requires the dependent variable to be ordinal.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X_train_split)
X_train_scaled = scaler.transform(X_train_split)
X_test_scaled = scaler.transform(X_test_split)

In [150]:
# Does Scaling + fix class imbalance + use a faster solve improve performance?
from sklearn.linear_model import LogisticRegression
lg2 = LogisticRegression(class_weight='balanced', solver = "sag") # inverse class weighting to reduce bias, solve faster
lg2.fit(X_train_scaled,y_train_tot_split)

acc = lg2.score(X_test_scaled , y_test_tot_split)
print("Accuracy:",acc)

Accuracy: 0.5453787878787879


In [151]:
# Does Scaling improve performance?  
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(X_train_scaled,y_train_tot_split)

LogisticRegression()

In [152]:
acc = lgr.score(X_test_scaled , y_test_tot_split)
print("Accuracy:",acc)

Accuracy: 0.6317424242424242


#### Scaling does not seem to improve the performance 

## PCA - Reducing Dimensions

In [153]:
# Third, logistic regression requires there to be little or no multicollinearity among the independent variables. This means that the independent variables should not be too highly correlated with each other.
# PCA
from sklearn.decomposition import PCA

# Create a PCA that will retain 99% of variance 
pca = PCA(n_components=0.99, whiten = True)

# Conduct PCA
X_train_pca = pca.fit_transform(X_train_scaled)

# Show results 
print("Original number of features:", X_train_scaled.shape[1])
print("Reduced number of features:", X_train_pca.shape[1])

Original number of features: 2
Reduced number of features: 2


In [154]:
# Conduct PCA from fitted set
X_test_pca = pca.transform(X_test_scaled)

In [155]:
# Does PCA improve performance? 
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression()
lgr.fit(X_train_pca,y_train_tot_split)

LogisticRegression()

In [156]:
acc = lgr.score(X_test_pca , y_test_tot_split)
print("Accuracy:",acc)

Accuracy: 0.6317424242424242


In [157]:
# Does Scaling + fix class imbalance + use a faster solve improve performance + reduce correlation? 
from sklearn.linear_model import LogisticRegression
lg2 = LogisticRegression(class_weight='balanced', solver = "sag") # inverse class weighting to reduce bias, solve faster
lg2.fit(X_train_pca,y_train_tot_split)

acc = lg2.score(X_test_pca, y_test_tot_split)
print("Accuracy:",acc)

Accuracy: 0.5453787878787879


## Tune Hyperparameters

In [167]:
lg1 = LogisticRegression()
lg1.fit(X_train_tot_split, y_train_tot_split)
acc = lg1.score(X_test_tot_split, y_test_tot_split)
print("Accuracy:",acc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8018181818181818


In [168]:
lg2 = LogisticRegression(C=0.1, solver='sag')
lg2.fit(X_train_tot_split, y_train_tot_split)
acc = lg2.score(X_test_tot_split, y_test_tot_split)
print("Accuracy:",acc)



Accuracy: 0.8032575757575757


In [169]:
lg3 = LogisticRegression(C=0.001, solver='sag')
lg3.fit(X_train_tot_split, y_train_tot_split)
ybar = lg3.predict(feature_test_tot)
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('sag_split_0_001.csv', index=False)



In [170]:
lg4 = LogisticRegression(C=1.0, solver='newton-cg')
lg4.fit(feature_train_tot, y)
ybar = lg4.predict(feature_test_tot)
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('newt_tot.csv', index=False)

In [171]:
lgr5 = LogisticRegression(C=1.0, solver='saga')
lgr5.fit(X_train_tot_split, y_train_tot_split)
ybar = lgr5.predict(feature_test_tot)
test_id = test_df.index
data = {'id': test_id+1, 'duration_label': ybar}
df = pd.DataFrame(data)
df.to_csv('sag_split.csv', index=False)

