# In [1]: Import libraries
import os
import pickle
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# Create every folder the web app reads artifacts from (no-op when present).
for output_dir in ("web/static",):
    os.makedirs(output_dir, exist_ok=True)
# In [2]: Load dataset
# The CSV location can be overridden with the TRAIN_CSV_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used, preserving the previous behaviour.
train_csv_path = os.environ.get(
    "TRAIN_CSV_PATH",
    r"C:\Users\Nandh\OneDrive\Documents\project\train.csv",
)
df = pd.read_csv(train_csv_path)
df.head()




# In [3]: Inspect dataset
# info() prints its column summary directly; describe() is left as the final
# expression so the notebook renders the summary-statistics table.
df.info()
df.describe()




# In [4]: Check class distribution and missing values
# The label column is "Fake Or Not Category" (the name every later cell
# filters, plots and splits on); the previous lookup of "Fake" did not match it.
print(df["Fake Or Not Category"].value_counts())
# Per-column count of missing values.
print(df.isnull().sum())





# In [5]: Visualize class imbalance
sns.set(style="whitegrid")
imbalance_fig, imbalance_ax = plt.subplots(figsize=(8, 4))
# One bar per label value, counted on the raw (unbalanced) frame.
sns.countplot(x="Fake Or Not Category", data=df, palette="cool", ax=imbalance_ax)
imbalance_ax.set_title("Class Label Counts")
imbalance_fig.tight_layout()
plt.show()





# In [6]: Remove duplicates
# Keep the first occurrence of each fully identical row.
is_repeat_row = df.duplicated()
df = df[~is_repeat_row]
df.shape






# In [7]: Balance the dataset (upsample minority, downsample majority)
def _resample_to_size(group, n_samples, seed=123):
    """Return a copy of ``group`` containing exactly ``n_samples`` rows.

    Rows are drawn with replacement only when ``group`` has fewer rows than
    requested. A group that already has the requested size is returned as-is
    (copied), so no original row is lost to needless bootstrapping.

    Raises:
        ValueError: if ``group`` is empty, since there is nothing to sample.
    """
    if len(group) == 0:
        raise ValueError("Cannot balance the dataset: one class has no rows.")
    if len(group) == n_samples:
        return group.copy()
    return resample(
        group,
        replace=len(group) < n_samples,
        n_samples=n_samples,
        random_state=seed,
    )


df_majority = df[df['Fake Or Not Category'] == 1]
df_minority = df[df['Fake Or Not Category'] == 0]

# Target the smaller class size; for very small datasets fall back to a fixed
# size. The fallback may exceed either class, which is why the helper decides
# per group whether sampling with replacement is required (sampling without
# replacement beyond the group size raises in scikit-learn).
n_target = min(len(df_majority), len(df_minority))
if n_target < 300:
    n_target = 500  # fallback target

df_minority_upsampled = _resample_to_size(df_minority, n_target)
df_majority_downsampled = _resample_to_size(df_majority, n_target)

# NOTE(review): balancing happens before the train/test split further down, so
# when rows are duplicated by upsampling the copies can land in both splits and
# inflate test accuracy. Consider resampling only the training portion.
df_balanced = pd.concat([df_minority_upsampled, df_majority_downsampled]).sample(frac=1, random_state=123)
df_balanced['Fake Or Not Category'].value_counts()



# In [8]: Visualize balanced data
balanced_fig, balanced_ax = plt.subplots(figsize=(8, 4))
# Same count plot as before, now drawn from the resampled frame.
sns.countplot(x="Fake Or Not Category", data=df_balanced, palette="cubehelix", ax=balanced_ax)
balanced_ax.set_title("Balanced Class Counts")
balanced_fig.tight_layout()
plt.show()





# In [9]: Drop unused columns (adjust to your dataset)
# Only the listed columns that actually exist are removed; drop() returns a
# new frame, so df_balanced itself is left untouched.
unused_columns = [
    name for name in ("UserID", "profile pic") if name in df_balanced.columns
]
df = df_balanced.drop(columns=unused_columns)
df.head()



# In [10]: Split features and target
# y is the label column; X is every remaining column.
y = df['Fake Or Not Category']
X = df.drop(columns=[y.name])
X.shape, y.shape



# In [11]: Train-test split
# 70/30 split, stratified on the label, with a fixed seed for repeatability.
split_options = {"test_size": 0.30, "stratify": y, "random_state": 40}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)




# In [12]: Feature scaling
# Fit the scaler on the training split only and reuse its statistics for the
# test split. Both variables are rebound to the arrays the scaler returns.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save test features for any external check if needed
pd.DataFrame(X_test).to_csv("web/static/test_features_preview.csv", index=False)

# Persist scaler for the web app. The context manager guarantees the file is
# flushed and closed; the bare open() call previously left the handle open.
with open("web/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)






# In [13]: KNN
# Fit a k-nearest-neighbours classifier with the library defaults and score it
# on both the training and the test split.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
pred_knn = knn_model.predict(X_test)
train_accuracy = accuracy_score(y_train, knn_model.predict(X_train))
test_accuracy = accuracy_score(y_test, pred_knn)

# Persist the fitted model; the context manager closes the file handle that
# the bare open() call previously leaked.
with open("web/knn_model.pkl", "wb") as model_file:
    pickle.dump(knn_model, model_file)
print("KNN Train:", train_accuracy, "Test:", test_accuracy)
print(classification_report(y_test, pred_knn))




# In [14]: Confusion matrix KNN
cm_knn = confusion_matrix(y_test, pred_knn)
# Wrap the raw counts in a labelled frame so the heatmap axes are readable.
knn_axis_labels = ["Real", "Fake"]
knn_cm_frame = pd.DataFrame(cm_knn, index=knn_axis_labels, columns=knn_axis_labels)
sns.heatmap(knn_cm_frame, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - KNN")
plt.tight_layout()
plt.show()





# In [15]: Logistic Regression
# max_iter is raised to 1000 iterations; the model is scored on both splits.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
pred_log = log_model.predict(X_test)
train_accuracy1 = accuracy_score(y_train, log_model.predict(X_train))
test_accuracy1 = accuracy_score(y_test, pred_log)

# Persist the fitted model; the context manager closes the file handle that
# the bare open() call previously leaked.
with open("web/log_model.pkl", "wb") as model_file:
    pickle.dump(log_model, model_file)
print("LogReg Train:", train_accuracy1, "Test:", test_accuracy1)
print(classification_report(y_test, pred_log))





# In [16]: Confusion matrix Logistic Regression
cm_log = confusion_matrix(y_test, pred_log)
# Wrap the raw counts in a labelled frame so the heatmap axes are readable.
log_axis_labels = ["Real", "Fake"]
log_cm_frame = pd.DataFrame(cm_log, index=log_axis_labels, columns=log_axis_labels)
sns.heatmap(log_cm_frame, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Logistic Regression")
plt.tight_layout()
plt.show()





# In [17]: Decision Tree (simple tuned defaults; adjust as needed)
# Entropy criterion, depth capped at 5, at least 5 samples to split a node,
# and a fixed seed so the fitted tree is repeatable.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5, random_state=42)
dt.fit(X_train, y_train)
pred_dt = dt.predict(X_test)
train_accuracy2 = accuracy_score(y_train, dt.predict(X_train))
test_accuracy2 = accuracy_score(y_test, pred_dt)

# Persist the fitted model; the context manager closes the file handle that
# the bare open() call previously leaked.
with open("web/dt_model.pkl", "wb") as model_file:
    pickle.dump(dt, model_file)
print("DecisionTree Train:", train_accuracy2, "Test:", test_accuracy2)
print(classification_report(y_test, pred_dt))







# In [18]: Confusion matrix Decision Tree
cm_dt = confusion_matrix(y_test, pred_dt)
# Wrap the raw counts in a labelled frame so the heatmap axes are readable.
dt_axis_labels = ["Real", "Fake"]
dt_cm_frame = pd.DataFrame(cm_dt, index=dt_axis_labels, columns=dt_axis_labels)
sns.heatmap(dt_cm_frame, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Decision Tree")
plt.tight_layout()
plt.show()







# In [19]: Random Forest
# 120 trees with depth capped at 10 and a fixed seed; scored on both splits.
rf = RandomForestClassifier(max_depth=10, n_estimators=120, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
train_accuracy3 = accuracy_score(y_train, rf.predict(X_train))
test_accuracy3 = accuracy_score(y_test, pred_rf)

# Persist the fitted model; the context manager closes the file handle that
# the bare open() call previously leaked.
with open("web/rf_model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)
print("RandomForest Train:", train_accuracy3, "Test:", test_accuracy3)
print(classification_report(y_test, pred_rf))





# In [20]: Confusion matrix Random Forest
cm_rf = confusion_matrix(y_test, pred_rf)
# Wrap the raw counts in a labelled frame so the heatmap axes are readable.
rf_axis_labels = ["Real", "Fake"]
rf_cm_frame = pd.DataFrame(cm_rf, index=rf_axis_labels, columns=rf_axis_labels)
sns.heatmap(rf_cm_frame, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Random Forest")
plt.tight_layout()
plt.show()





# In [21]: XGBoost
# Gradient-boosted trees: 500 rounds at a 0.05 learning rate, depth 5, with
# 90% row and column subsampling; n_jobs=-1 and a fixed seed are passed through.
xgb_model = xgb.XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=5,
    subsample=0.9, colsample_bytree=0.9, random_state=42, n_jobs=-1
)
xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict(X_test)
train_accuracy4 = accuracy_score(y_train, xgb_model.predict(X_train))
test_accuracy4 = accuracy_score(y_test, pred_xgb)

# Persist the fitted model; the context manager closes the file handle that
# the bare open() call previously leaked.
with open("web/xgb_model.pkl", "wb") as model_file:
    pickle.dump(xgb_model, model_file)
print("XGBoost Train:", train_accuracy4, "Test:", test_accuracy4)
print(classification_report(y_test, pred_xgb))






# In [22]: Confusion matrix XGBoost
cm_xgb = confusion_matrix(y_test, pred_xgb)
# Wrap the raw counts in a labelled frame so the heatmap axes are readable.
xgb_axis_labels = ["Real", "Fake"]
xgb_cm_frame = pd.DataFrame(cm_xgb, index=xgb_axis_labels, columns=xgb_axis_labels)
sns.heatmap(xgb_cm_frame, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - XGBoost")
plt.tight_layout()
plt.show()




# In [30]: Confusion matrices side by side (all models)
cms = {
    "KNN": cm_knn,
    "Logistic Regression": cm_log,
    "Decision Tree": cm_dt,
    "Random Forest": cm_rf,
    "XGBoost": cm_xgb
}
class_names = ['Real', 'Fake']

# Size the grid from the dictionary instead of a hard-coded 5 so adding or
# removing a model cannot silently drop a panel or leave an empty one.
# squeeze=False keeps the axes as an array even when only one model is listed.
n_models = len(cms)
fig, axes_grid = plt.subplots(1, n_models, figsize=(5 * n_models, 5), squeeze=False)
axes = axes_grid.ravel()
for ax, (title, cm) in zip(axes, cms.items()):
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Actual")
    ax.set_xlabel("Predicted")
fig.tight_layout()
# Save before show(): once the figure has been displayed the canvas may be gone.
fig.savefig("web/static/confusion_matrices.png", dpi=150)
plt.show()




# In [31]: Final comparison table
# Assemble one row per classifier from parallel name / train / test sequences.
classifier_labels = [
    'KNN-Classifier',
    'Logistic regression',
    'Decision Tree-Classifier',
    'Random Forest',
    'XGBoost',
]
train_scores = [train_accuracy, train_accuracy1, train_accuracy2, train_accuracy3, train_accuracy4]
test_scores = [test_accuracy, test_accuracy1, test_accuracy2, test_accuracy3, test_accuracy4]

all_model_result = pd.DataFrame(
    list(zip(classifier_labels, train_scores, test_scores)),
    columns=['Classifier', 'Train-Accuracy', 'Test-Accuracy'],
)

print(all_model_result)
all_model_result.to_csv("web/static/all_model_result.csv", index=False)





# In [32]: Bar chart for accuracy comparison and save for web app
accuracy_fig, accuracy_ax = plt.subplots(figsize=(10, 6))
bar_width = 0.35
index = np.arange(len(all_model_result))

# Train bars sit on the integer positions; test bars are offset by one bar width.
accuracy_ax.bar(index, all_model_result['Train-Accuracy'], bar_width, label='Train Accuracy', color='skyblue')
accuracy_ax.bar(index + bar_width, all_model_result['Test-Accuracy'], bar_width, label='Test Accuracy', color='salmon')

accuracy_ax.set_xlabel('Classifier')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Comparison of Classifier Performance')
# Centre each tick between its pair of bars.
accuracy_ax.set_xticks(index + bar_width/2)
accuracy_ax.set_xticklabels(all_model_result['Classifier'], rotation=30)
accuracy_ax.set_ylim(0, 1.05)
accuracy_ax.legend()
accuracy_fig.tight_layout()
accuracy_fig.savefig("web/static/accuracy_bar.png", dpi=150)
plt.show()

# Save a default chosen model (RandomForest) for web prediction.
# The context manager closes the file handle that the bare open() call
# previously leaked.
with open("web/model_for_web.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

print("Artifacts saved to web/static/: confusion_matrices.png, accuracy_bar.png, all_model_result.csv")
print("Models saved to web/: scaler.pkl, model_for_web.pkl (RandomForest)")






# Mirror the report artifacts into the top-level static/ folder.
# Every figure has already been shown with plt.show() by this point, so calling
# plt.savefig() here would write the same empty canvas under both names.
# Copy the images that were saved while the figures were still live instead.
# The folder is created first because only web/static is created earlier.
os.makedirs("static", exist_ok=True)

# Save confusion matrix diagram
shutil.copyfile("web/static/confusion_matrices.png", "static/confusion_matrix.png")

# Save accuracy bar chart
shutil.copyfile("web/static/accuracy_bar.png", "static/accuracy_bar.png")

# Save accuracy results table
all_model_result.to_csv("static/all_model_result.csv", index=False)

print("Files saved: confusion_matrix.png, accuracy_bar.png, all_model_result.csv")




