In [None]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library diagnostics (e.g. chained-assignment or
# convergence warnings) that may point at real problems — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset from the working directory and display it.
md_df=pd.read_csv('Microsoft_Data.csv')
md_df

In [None]:
# (rows, columns) of the freshly loaded frame.
md_df.shape

# Drop sparse and uninformative columns

In [None]:
# Percentage of missing values per column; columns above 50% are flagged for removal.
missing_pct=md_df.isnull().sum()/md_df.shape[0]*100
na_df=pd.DataFrame({'col_name':md_df.columns,'na_per':missing_pct})

na_grt50=na_df.loc[na_df['na_per']>50,'col_name'].tolist()
na_grt50

In [None]:
# Columns that carry no usable signal: a single distinct value (constant) or a
# distinct value in every row (identifier-like).
# `nunique` has to be *called*: comparing the bound method itself to an integer
# is always False, which silently left this list empty.
n_rows=md_df.shape[0]
distinct_counts=md_df.nunique()

non_sense=[col for col in md_df.columns if distinct_counts[col]==1 or distinct_counts[col]==n_rows]

non_sense

In [None]:
# Everything scheduled for removal: the sparse columns first, then the uninformative ones.
col_to_drop=[*na_grt50,*non_sense]
col_to_drop

In [None]:
# Remove the flagged columns from the working frame.
md_df=md_df.drop(columns=col_to_drop)


# chi-square test

In [None]:

from scipy.stats import chi2_contingency

# Split the remaining columns by dtype: numeric (int64/float64) vs. object.
con_cols=[name for name,kind in md_df.dtypes.items() if kind=='int64' or kind=='float64']

cat_cols=[name for name,kind in md_df.dtypes.items() if kind=='object']

sig_val=0.05
chi2_corr_cols=[]
chi2_nocorr_cols=[]

# Chi-square test of independence between each object column and the target:
# p-values below sig_val go to chi2_corr_cols, all others to chi2_nocorr_cols.
for feature in cat_cols:
    observed=pd.crosstab(md_df[feature],md_df['HasDetections'])
    p_value=chi2_contingency(observed)[1]
    bucket=chi2_corr_cols if p_value<sig_val else chi2_nocorr_cols
    bucket.append(feature)

print(chi2_corr_cols)

In [None]:
# Object columns whose chi-square p-value was not below sig_val.
print(chi2_nocorr_cols)

# ANOVA test

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA of every numeric feature against the binary target.
sig_val=0.05
anova_corr_cols=[]
anova_non_corr_cols=[]

# The target is numeric as well, so keep it out of the feature list: testing it
# against itself is meaningless.
con_cols=[col for col in md_df.columns
          if (md_df[col].dtype=='int64' or md_df[col].dtype=='float64') and col!='HasDetections']

detected_mask=md_df['HasDetections']==1
not_detected_mask=md_df['HasDetections']==0

for col in con_cols:
    # f_oneway propagates NaN by default: one missing value makes the p-value NaN,
    # and `nan < sig_val` is False, so the column would be classed as uncorrelated
    # and dropped. Run the test on the observed values only.
    y_list=md_df.loc[detected_mask,col].dropna()
    n_list=md_df.loc[not_detected_mask,col].dropna()

    p_value=f_oneway(y_list,n_list)[1]

    if p_value<sig_val:
        anova_corr_cols.append(col)
    else:
        # Also reached when p_value is NaN (e.g. a group with no observed values).
        anova_non_corr_cols.append(col)


print(anova_corr_cols)
print(anova_non_corr_cols)

In [None]:
# Keep the target aside, then drop it together with every feature that failed its test.
y=md_df['HasDetections']
all_non_corr_cols=[*anova_non_corr_cols,*chi2_nocorr_cols,'HasDetections']

md_df=md_df.drop(columns=all_non_corr_cols)


# Train-Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
x_train,x_test,y_train,y_test=train_test_split(
    md_df,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Size of the held-out split (missing values are filled in a later cell).
x_test.shape

In [None]:
# Recompute the numeric / object column lists on the reduced feature frame.
con_cols=[name for name,kind in md_df.dtypes.items() if kind=='int64' or kind=='float64']

cat_cols=[name for name,kind in md_df.dtypes.items() if kind=='object']

In [None]:
# Impute missing values with statistics learned from the training split only, so
# nothing from the test split influences the fill values.
# The filled Series is assigned back instead of calling fillna(..., inplace=True)
# on x_train[col]: that form works on a temporary Series and is not guaranteed to
# update the parent frame (it is a no-op under pandas Copy-on-Write).
for col in con_cols:
    train_mean=x_train[col].mean()
    x_train[col]=x_train[col].fillna(train_mean)
    x_test[col]=x_test[col].fillna(train_mean)

for col in cat_cols:
    # mode() can return several values; take the first one, as before.
    train_mode=x_train[col].mode()[0]
    x_train[col]=x_train[col].fillna(train_mode)
    x_test[col]=x_test[col].fillna(train_mode)

# scaling

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
std_scalar=StandardScaler()

# Standardise each numeric column separately: the scaler is re-fitted on the
# training values of the column, and that same fit is applied to the test values.
for col in con_cols:
    train_values=x_train[col].to_numpy().reshape(-1,1)
    test_values=x_test[col].to_numpy().reshape(-1,1)
    x_train[col]=std_scalar.fit_transform(train_values)
    x_test[col]=std_scalar.transform(test_values)
                                          

In [None]:
# One-hot encode the object columns of each split independently.
oe_train,oe_test=(pd.get_dummies(split[cat_cols]) for split in (x_train,x_test))

In [None]:
# Shapes of the two encoded frames, printed so their column counts can be compared.
print(oe_train.shape)
print(oe_test.shape)

In [None]:
# Keep only the dummy columns present in BOTH splits (inner join on the column
# axis), so train and test end up with the same feature columns.
oe_train_final,oe_test_final=oe_train.align(oe_test,join='inner',axis=1)

In [None]:
# Shapes after alignment.
print(oe_train_final.shape)
print(oe_test_final.shape)

In [None]:
# Final design matrices: scaled numeric columns followed by the aligned dummy columns.
x_train_final=pd.concat((x_train[con_cols],oe_train_final),axis='columns')
x_test_final=pd.concat((x_test[con_cols],oe_test_final),axis='columns')

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# non-convergence warning from this model would not be visible — confirm it converges.
logreg=LogisticRegression()

In [None]:
# Fit on the prepared training matrix and training labels.
logreg.fit(x_train_final,y_train)

# Lasso and Ridge

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# L1- and L2-penalised linear classifiers trained with SGD.
# l1_ratio is omitted: scikit-learn only uses it when penalty='elasticnet'.
# random_state fixes the shuffling so reruns give the same coefficients.
lasso=SGDClassifier(penalty='l1', alpha=0.01, random_state=42, verbose=1)
ridge=SGDClassifier(penalty='l2', alpha=0.01, random_state=42, verbose=1)

lasso.fit(x_train_final,y_train)
ridge.fit(x_train_final,y_train)

lasso_coeffs=lasso.coef_
ridge_coeffs=ridge.coef_

# Number of coefficients that are exactly zero under each penalty. These used to
# be bare expressions in the middle of the cell, whose values are never shown,
# so they are printed explicitly.
print('lasso_zero_coeffs:',len(lasso_coeffs[lasso_coeffs==0]))
print('ridge_zero_coeffs:',len(ridge_coeffs[ridge_coeffs==0]))

lasso_test_pred=lasso.predict(x_test_final)
ridge_test_pred=ridge.predict(x_test_final)

print('lasso_test_accuracy:',accuracy_score(y_test,lasso_test_pred))
print('ridge_test_accuracy:',accuracy_score(y_test,ridge_test_pred))

In [None]:
# Hard class predictions of the logistic-regression model on both splits.
test_pred=logreg.predict(x_test_final)
train_pred=logreg.predict(x_train_final)

In [None]:
# Class-membership probabilities on the test set; the second column (index 1)
# is kept as the score used for the ROC analysis.
probabs=logreg.predict_proba(x_test_final)
pos_probabs=probabs[:,1]

In [None]:
# ROC curve points (false/true positive rate per threshold) for the
# logistic-regression scores on the test set.
from sklearn.metrics import roc_curve

fpr,tpr,thresholds=roc_curve(y_test,pos_probabs)

In [None]:
import matplotlib.pyplot as plt

# ROC curve with the chance diagonal for reference; labels and a title are added
# so the figure can be read on its own.
plt.plot(fpr,tpr,label='model')
plt.plot([0,1],[0,1],linestyle='--',label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - logistic regression')
plt.legend();

In [None]:
from sklearn.metrics import roc_auc_score

# AUC has to be computed from the continuous scores, not from the hard 0/1
# predictions: thresholded labels reduce the ROC curve to a single operating
# point and give a different AUC from the curve plotted from pos_probabs.
roc_auc_score(y_test,pos_probabs)

In [None]:
# confusion matrix

from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic-regression test predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test,test_pred)

In [None]:
from sklearn.metrics import precision_score

# Precision of the logistic-regression predictions on the test set.
precision_score(y_test,test_pred)

In [None]:
from sklearn.metrics import recall_score

# Recall of the logistic-regression predictions on the test set.
recall_score(y_test,test_pred)

In [None]:
from sklearn.metrics import f1_score 

# F1 score of the logistic-regression predictions on the test set.
f1_score(y_test,test_pred)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows the logistic-regression model classifies correctly.
accuracy_score(y_test,test_pred) # Test accuracy

In [None]:
# Same metric on the training split, for comparison with the test accuracy.
accuracy_score(y_train,train_pred) # Train accuracy

# DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Entropy-based decision tree. random_state is fixed because the estimator
# permutes features when searching for splits, so an unseeded tree can differ
# between runs.
d_tree=DecisionTreeClassifier(criterion='entropy',random_state=42)

In [None]:
d_tree.fit(x_train_final,y_train)

# Hard predictions on both splits.
d_tree_train_pred=d_tree.predict(x_train_final)
d_tree_test_pred=d_tree.predict(x_test_final)

# Test-set probabilities; the second column (index 1) is the score fed to the ROC curve.
d_tree_probabs=d_tree.predict_proba(x_test_final)
pos_probabs=d_tree_probabs[:,1]

# auc-roc curve
from sklearn.metrics import roc_curve
roc_points=roc_curve(y_test,pos_probabs)
fpr,tpr,thresholds=roc_points

In [None]:
import matplotlib.pyplot as plt

# ROC curve with the chance diagonal for reference; labels and a title are added
# so the figure can be read on its own.
plt.plot(fpr,tpr,label='model')
plt.plot([0,1],[0,1],linestyle='--',label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - decision tree')
plt.legend();

In [None]:
from sklearn.metrics import roc_auc_score

# AUC has to be computed from the probability scores, not from the hard 0/1
# predictions, so that it matches the ROC curve plotted from pos_probabs.
roc_auc_score(y_test,pos_probabs)

In [None]:
# confusion matrix

from sklearn.metrics import confusion_matrix

# Confusion matrix of the decision-tree test predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test,d_tree_test_pred)

In [None]:
from sklearn.metrics import precision_score

# Precision of the decision-tree predictions on the test set.
precision_score(y_test,d_tree_test_pred)

In [None]:
from sklearn.metrics import recall_score

# Recall of the decision-tree predictions on the test set.
recall_score(y_test,d_tree_test_pred)

In [None]:
from sklearn.metrics import f1_score 

# F1 score of the decision-tree predictions on the test set.
f1_score(y_test,d_tree_test_pred)

In [None]:
from sklearn.metrics import accuracy_score

# Train vs. test accuracy of the decision tree; a large gap between the two
# indicates over-fitting.
print(accuracy_score(y_train,d_tree_train_pred)) # Train accuracy
print(accuracy_score(y_test,d_tree_test_pred)) # Test accuracy

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree on a wide canvas. The return value is bound to `a`,
# presumably so the returned objects are not echoed as cell output.
# NOTE(review): the tree was built without a depth limit, so this figure may be
# very large and slow to render — consider passing max_depth to plot_tree.
plt.figure(figsize=(25,10))
a = plot_tree(d_tree)