In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import sklearn.discriminant_analysis as skl_da
import sklearn.neighbors as skl_nb
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import RandomizedSearchCV

#from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('png')
from IPython.core.pylabtools import figsize
figsize(10, 6) # Width and height
#plt.style.use('seaborn-white')

In [None]:
# Read the training CSV directly from its raw GitHub URL into a DataFrame.
url = 'https://raw.githubusercontent.com/Ari-vu/SML/main/Given_data/train.csv'
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Sample 500 distinct row positions for training; the fixed seed keeps the
# split reproducible across runs.
np.random.seed(1)
trainI = np.random.choice(data.shape[0], size=500, replace=False)
# Build the boolean mask by position. `trainI` holds row positions, not index
# labels, so matching it against `data.index` would only be correct while the
# frame carries a default 0..n-1 index.
trainIndex = np.zeros(data.shape[0], dtype=bool)
trainIndex[trainI] = True
train = data.iloc[trainIndex] # training set: the sampled rows, in file order
test = data.iloc[~trainIndex] # test set: every remaining row



LDA

In [None]:
# Predictor columns shared by the training and test design matrices.
feature_columns = [
    'Number words female',
    'Total words',
    'Number of words lead',
    'Difference in words lead and co-lead',
    'Number of male actors',
    'Year',
    'Number of female actors',
    'Number words male',
    'Gross',
    'Mean Age Male',
    'Mean Age Female',
    'Age Lead',
    'Age Co-Lead',
]

# Inputs and target ('Lead') for each split.
X_train, Y_train = train[feature_columns], train['Lead']
X_test, Y_test = test[feature_columns], test['Lead']

# Linear discriminant analysis built with no arguments; the fitted estimator
# is the cell's displayed value.
model = skl_da.LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)

LinearDiscriminantAnalysis()

In [None]:
# Class-membership probabilities for every test row; columns follow the order
# of `model.classes_`, which is printed first for reference.
predict_prob = model.predict_proba(X_test)

print('The class order in the model:')
print(model.classes_)
print('Examples of predicted probablities for the above classes:')
# Suppress scientific notation (e.g. 1.0e-2) and show three decimals.
with np.printoptions(precision=3, suppress=True):
    print(predict_prob[:5])  # inspect the first 5 predictions

The class order in the model:
['Female' 'Male']
Examples of predicted probablities for the above classes:
[[0.083 0.917]
 [0.173 0.827]
 [0.02  0.98 ]
 [0.529 0.471]
 [0.008 0.992]]


In [None]:
# Display the hyper-parameters of the current `model` (it was constructed with
# no arguments, so these are the estimator's defaults).
model.get_params()

{'covariance_estimator': None,
 'n_components': None,
 'priors': None,
 'shrinkage': None,
 'solver': 'svd',
 'store_covariance': False,
 'tol': 0.0001}

In [None]:
# Turn probabilities into labels with a 0.5 threshold on the first probability
# column. The labels are read from `model.classes_`, whose order defines the
# columns of `predict_prob`, so the labels cannot drift out of sync with the
# columns. No pre-allocation is needed: `np.where` builds the array itself.
prediction = np.where(predict_prob[:, 0] >= 0.5, model.classes_[0], model.classes_[1])
prediction[0:5] # Inspect the first 5 predictions after labeling

array(['Male', 'Male', 'Male', 'Female', 'Male'], dtype='<U6')

In [None]:
# Cross-tabulate predicted labels (rows) against the true labels (columns).
confusion_table = pd.crosstab(prediction, Y_test)
print("Confusion matrix:\n")
print(confusion_table, '\n')
# Accuracy: share of test rows whose predicted label equals the true label.
print(f"Accuracy: {np.mean(prediction == Y_test):.3f}")

Confusion matrix:

Lead    Female  Male
row_0               
Female      71    20
Male        63   385 

Accuracy: 0.846


In [None]:
# Candidate values for the discriminant-analysis `solver` parameter.
solvers = [
    'svd',
    'lsqr',
    'eigen',
]

# Search space handed to the cross-validated search: one entry per tuned parameter.
random_grid = dict(solver=solvers)

In [None]:
# Cross-validated comparison (3 folds) of the solvers listed in `random_grid`.
# `n_iter` is tied to the number of candidates: asking for more iterations than
# the search space contains makes scikit-learn emit a warning and fall back to
# trying each candidate once anyway.
CV = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=len(random_grid['solver']),
    cv=3,
    verbose=2,
    random_state=1,  # fixed seed so the sampling order is reproducible
    n_jobs=-1,       # use all available cores
)
CV.fit(X_train, Y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits




RandomizedSearchCV(cv=3, estimator=LinearDiscriminantAnalysis(), n_jobs=-1,
                   param_distributions={'solver': ['svd', 'lsqr', 'eigen']},
                   random_state=1, verbose=2)

In [None]:
# Display the parameter setting the fitted search selected as best.
CV.best_params_

{'solver': 'svd'}

The search selects `svd`, which is already the default solver, so the default settings are kept.

QDA

In [None]:
# Rebind `model` to a quadratic discriminant classifier fitted on the same
# training split; `fit` returns the estimator, which is then displayed.
model = skl_da.QuadraticDiscriminantAnalysis().fit(X_train, Y_train)
model



QuadraticDiscriminantAnalysis()

In [None]:
# Class-membership probabilities from the current `model` for every test row.
predict_prob = model.predict_proba(X_test)

print('The class order in the model:')
print(model.classes_, '\n')
print('Examples of predicted probablities for the above classes:')
# Three decimals, no scientific notation.
with np.printoptions(precision=3, suppress=True):
    print(predict_prob[:5])  # inspect the first 5 predictions

The class order in the model:
['Female' 'Male'] 

Examples of predicted probablities for the above classes:
[[0.016 0.984]
 [0.038 0.962]
 [0.012 0.988]
 [0.358 0.642]
 [0.001 0.999]]


In [None]:
# Turn probabilities into labels with a 0.5 threshold on the first probability
# column. The labels are read from `model.classes_`, whose order defines the
# columns of `predict_prob`, so the labels cannot drift out of sync with the
# columns. No pre-allocation is needed: `np.where` builds the array itself.
prediction = np.where(predict_prob[:, 0] >= 0.5, model.classes_[0], model.classes_[1])
print('First five predictions:')
print(prediction[0:5], '\n') # Inspect the first 5 predictions after labeling.
# Confusion matrix: predicted labels on rows, true labels on columns
print('Confusion matrix:\n')
print(pd.crosstab(prediction, Y_test), '\n')
# Accuracy: share of test rows whose predicted label equals the true label
print('Accuracy:')
print(f"{np.mean(prediction == Y_test):.3f}")

First five predictions:
['Male' 'Male' 'Male' 'Male' 'Male'] 

Confusion matrix:

Lead    Female  Male
row_0               
Female      98    36
Male        36   369 

Accuracy:
0.866
