In [None]:
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sb

In [None]:
# Load the social-network ads dataset from the repository-relative CSV file.
ads_data_df = pd.read_csv(filepath_or_buffer='datasets/Social_Network_Ads.csv')

# Print the column / dtype / non-null overview first ...
ads_data_df.info()
# ... then leave the descriptive statistics as the cell's displayed output.
ads_data_df.describe()

In [None]:
# Pairwise correlations between the numeric columns only, so the call does not
# depend on whether the CSV also carries text columns (pandas >= 2.0 raises on
# non-numeric data instead of silently dropping it).
corr = ads_data_df.select_dtypes(include='number').corr()

plt.figure(figsize=[5,4])
sb.heatmap(corr)
plt.title('Correlation Matrix')
# Render the figure explicitly; a bare plt.plot() adds nothing to the axes and
# only echoes an empty list as the cell output.
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

z_scalar = StandardScaler()

# Standardise each feature to zero mean / unit variance, one column at a time.
# The same scaler object is refit for every column, so after the loop it only
# retains the parameters learned from the last column ('EstimatedSalary').
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features. Consider fitting on the
# training rows only and applying .transform() to the test rows.
scaled_columns = ['Age', 'EstimatedSalary']
for column in scaled_columns:
    ads_data_df[[column]] = z_scalar.fit_transform(ads_data_df[[column]])

# Sanity check: means should be ~0 and standard deviations 1.
# StandardScaler divides by the population standard deviation (ddof=0), whereas
# pandas defaults to the sample estimate (ddof=1); use ddof=0 so the check
# matches what the scaler actually normalised.
print(f"Means: \n{ads_data_df[scaled_columns].mean()}")
print(f"Std:   \n{ads_data_df[scaled_columns].std(ddof=0)}")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical across re-runs.
train, test = train_test_split(
    ads_data_df,
    test_size=0.2,
    random_state=7,
)

# Summary of the training partition (row count, columns, dtypes).
train.info()

In [None]:
# Separate the target column ('Purchased') from the feature columns for both
# partitions. The targets are kept as single-column DataFrames.
y_train = train[['Purchased']]
x_train = train.drop(['Purchased'], axis=1)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print().
x_train.info()

# The evaluation data must come from the held-out `test` partition; building it
# from `train` would score the model on the very rows it was fitted on.
y_test = test[['Purchased']]
x_test = test.drop(['Purchased'], axis=1)
x_test.info()

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with scikit-learn's default settings.
logistci_regression_model = LogisticRegression()
# scikit-learn expects a 1-D target; passing a single-column DataFrame triggers
# a DataConversionWarning, so flatten it explicitly.
logistci_regression_model.fit(x_train, y_train.to_numpy().ravel())

# Hard class predictions and per-class probabilities for the evaluation rows.
y_pred = logistci_regression_model.predict(x_test)
y_pred_prob = logistci_regression_model.predict_proba(x_test)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, roc_auc_score, roc_curve, RocCurveDisplay

# Scores used for ranking-based metrics: the second column of predict_proba,
# i.e. the probability assigned to the classifier's second class.
positive_class_prob = y_pred_prob[:, 1]

print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
# ROC-AUC measures ranking quality, so it must be computed from continuous
# scores. Feeding it hard 0/1 predictions collapses the curve to a single
# threshold and disagrees with the ROC curve plotted below.
print(f"ROC-AUC Score: {roc_auc_score(y_test, positive_class_prob)}")
print(f"Classification Report: \n {classification_report(y_test, y_pred)}")

# ROC curve built from the same probability scores as the AUC above.
fpr, tpr, _ = roc_curve(y_test, positive_class_prob)
rocd = RocCurveDisplay(fpr=fpr, tpr=tpr)
rocd.plot()

# Confusion matrix of the hard predictions.
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm).plot()
# Render both figures and avoid echoing the display object's repr.
plt.show()



In [None]:
# feature importances
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets);
# fixing the seed makes the reported importances reproducible across re-runs.
randomForestClassifier = RandomForestClassifier(random_state=7)
# Flatten the single-column target to 1-D to avoid sklearn's DataConversionWarning.
randomForestClassifier.fit(x_train, y_train.to_numpy().ravel())

# Impurity-based importances, paired with the column names seen during fit.
importances = randomForestClassifier.feature_importances_
feature_names = randomForestClassifier.feature_names_in_

plt.figure(figsize=(5, 3))
plt.barh(feature_names, importances)
plt.xlabel('Importance')
plt.title('Feature Importances')
# Render the figure; a bare plt.plot() draws nothing and echoes an empty list.
plt.show()

In [None]:
# Do xy plot
# Scatter the evaluation rows in (Age, EstimatedSalary) space and colour each
# point by the class the logistic-regression model predicted for it.
# A detached plt.Line2D(...) artist is never attached to an axes and therefore
# never drawn, so the predictions are encoded as point colours instead.
fig, ax = plt.subplots(figsize=(5, 3))
points = ax.scatter(
    x_test['Age'],
    x_test['EstimatedSalary'],
    c=y_pred,
    cmap='coolwarm',
)
ax.set_xlabel('Age')
ax.set_ylabel('EstimatedSalary')
ax.set_title('Test Data by Predicted Class')
# One legend entry per distinct predicted value.
ax.legend(*points.legend_elements(), title='Predicted')
plt.show()



In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5, metric=