In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# load the training data directly from the raw GitHub URL
url = (
    'https://raw.githubusercontent.com/Ari-vu/SML/main/Given_data/train.csv'
)
df = pd.read_csv(url)

In [None]:
# split the rows into a random train/test partition
perc = 0.7  # fraction of rows used for training
random_state = 10
np.random.seed(random_state)  # seed the global NumPy RNG so the split is reproducible
# draw row *positions* (0..n-1) without replacement for the training set
trainIndex = np.random.choice(df.shape[0], size=int(perc*df.shape[0]), replace=False)
train = df.iloc[trainIndex]
# trainIndex holds positions, so the hold-out mask is built from positions as well;
# comparing against df.index labels would only be right for a default 0..n-1 index
test = df.iloc[~np.isin(np.arange(df.shape[0]), trainIndex)]

In [None]:
# separate the target column ('Lead') from the predictor columns
X_train, y_train = train.drop(columns=['Lead']), train['Lead']
X_test, y_test = test.drop(columns=['Lead']), test['Lead']

# min-max scale the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
norm = MinMaxScaler()
norm.fit(X_train)
X_train_norm, X_test_norm = norm.transform(X_train), norm.transform(X_test)

In [None]:
# first model: logistic regression on the full, scaled feature set
model = LogisticRegression(
    random_state=random_state,
    solver='liblinear',
)
model.fit(X_train_norm, y_train)

LogisticRegression(random_state=10, solver='liblinear')

In [None]:
# accuracy = share of rows whose predicted label equals the true label
train_pred = model.predict(X_train_norm)
test_pred = model.predict(X_test_norm)
print('Accuracy on training data: %.4f' % (train_pred == y_train).mean())
print('Accuracy on test data: %.4f' % (test_pred == y_test).mean())
# confusion table on the test split: rows are predictions, columns are true labels
pd.crosstab(test_pred, y_test)

Accuracy on training data: 0.8033
Accuracy on test data: 0.8237


Lead,Female,Male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,23,8
Male,47,234


In [None]:
# rank the features by their logistic-regression coefficient (largest first).
# The coefficients are signed and come from the model fitted on min-max scaled
# features, so the magnitude, not the position in this list, indicates influence.
importances = model.coef_[0]
# sort on the raw coefficients and round only for display, so that values that
# round to the same number are still ordered correctly
ranked = sorted(zip(X_train.columns, importances), key=lambda pair: pair[1], reverse=True)
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]
for feature, score in feature_importances:
    print('Feature: {:40} Score: {}'.format(feature, score))

Feature: Age Lead                                 Score: 3.2
Feature: Number of male actors                    Score: 2.95
Feature: Difference in words lead and co-lead     Score: 2.52
Feature: Number words female                      Score: 2.11
Feature: Number of words lead                     Score: 0.32
Feature: Total words                              Score: 0.21
Feature: Mean Age Female                          Score: 0.16
Feature: Year                                     Score: -0.03
Feature: Gross                                    Score: -0.13
Feature: Mean Age Male                            Score: -0.39
Feature: Number words male                        Score: -1.37
Feature: Age Co-Lead                              Score: -2.69
Feature: Number of female actors                  Score: -4.08


In [None]:
# reduced feature set: drop the label, every word-count column, 'Year' and 'Gross'.
# NOTE: this is not a "coefficient close to 0" filter -- some of the dropped
# columns (e.g. 'Difference in words lead and co-lead', 'Number words female')
# have large coefficients in the first model's ranking.
dropped_columns = ['Lead', 'Number words female', 'Total words', 'Number of words lead',
                   'Difference in words lead and co-lead', 'Year', 'Number words male',
                   'Gross']
# a single list keeps the train and test column sets from drifting apart
X_train2 = train.drop(columns=dropped_columns)
X_test2 = test.drop(columns=dropped_columns)

# re-fit the scaler on the reduced training features only, then apply it to both splits
norm = MinMaxScaler().fit(X_train2)
X_train_norm2 = norm.transform(X_train2)
X_test_norm2 = norm.transform(X_test2)

In [None]:
model2 = LogisticRegression(solver='liblinear', random_state=random_state)
model2.fit(X_train2, y_train)

LogisticRegression(random_state=10, solver='liblinear')

In [None]:
# calculate accuracy
print('Accuracy on training data: %.4f' %np.mean(model2.predict(X_train2)==y_train))
print('Accuracy on test data: %.4f' %np.mean(model2.predict(X_test2)==y_test))
pd.crosstab(model2.predict(X_test2), y_test)

Accuracy on training data: 0.8019
Accuracy on test data: 0.8045


Lead,Female,Male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,31,22
Male,39,220
