## Testing prediction model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score


In [2]:
# Load dataset
data = pd.read_csv("test_data cs 1.csv")  # Replace with actual dataset path

In [3]:
# Select relevant columns (assume target column is 'target')
features = [col for col in data.columns if col != 'verified']
target = 'verified'

In [4]:
print(data.columns)

Index(['text', 'screen_name', 'user_lang', 'lang', 'time_zone', 'location',
       'verified', 'friends_count', 'compare_text', 'source', 'created_at',
       'favourites_count', 'listed_count', 'statuses_count', 'followers_count',
       'label', 'cred_score', 'eye_truth'],
      dtype='object')


In [5]:
# Clean missing data
data.dropna(inplace=True)

In [6]:
print(data[features].dtypes)

text                 object
screen_name          object
user_lang            object
lang                 object
time_zone            object
location             object
friends_count         int64
compare_text        float64
source               object
created_at           object
favourites_count      int64
listed_count          int64
statuses_count        int64
followers_count       int64
label                object
cred_score            int64
eye_truth           float64
dtype: object


In [7]:
from sklearn.preprocessing import LabelEncoder

for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [8]:
for col in data[features]:
    data[col] = pd.to_numeric(data[col], errors='coerce')  # Convert invalid strings to NaN
data.dropna(inplace=True)  # Drop rows with NaN values


In [9]:
# Normalize data for Neural Network
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

In [10]:
# Feature selection
selector = SelectKBest(f_classif, k=10)
data_selected = selector.fit_transform(data[features], data[target])

In [11]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(data_selected, data[target], test_size=0.2, random_state=42)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Train models
models = {
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    "Boosted Decision Tree": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression()
}

# Train, score, and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    print(f"{name}: Accuracy = {accuracy:.4f}, Cross-Validation Score = {cv_score:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))


Neural Network: Accuracy = 0.9630, Cross-Validation Score = 0.9645
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     19373
           1       0.00      0.00      0.00       743

    accuracy                           0.96     20116
   macro avg       0.48      0.50      0.49     20116
weighted avg       0.93      0.96      0.94     20116

Boosted Decision Tree: Accuracy = 0.9630, Cross-Validation Score = 0.9644
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     19373
           1       0.33      0.00      0.00       743

    accuracy                           0.96     20116
   macro avg       0.65      0.50      0.49     20116
weighted avg       0.94      0.96      0.95     20116

Logistic Regression: Accuracy = 0.9631, Cross-Validation Score = 0.9645
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     19373
           1       0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
