<h1>Part 4: preliminary modelling</h1>

**Loading necessary libraries and setting display settings**

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, fbeta_score
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Never truncate cell contents when rendering DataFrames (no column-width limit).
pd.options.display.max_colwidth = None

**Loading the data**

In [3]:
# Read train_final.csv (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="train_final.csv")

**Splitting the data**

In [4]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=2092,
)

**Feature and target creation**

In [7]:
# For each split, everything except `sentiment` is a feature and `sentiment` is the target.
X_train, y_train = train_df.drop(columns='sentiment'), train_df.loc[:, 'sentiment']
X_test, y_test = test_df.drop(columns='sentiment'), test_df.loc[:, 'sentiment']

In [None]:
# Baseline: fit a logistic regression on the training split and score it on the held-out split.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)        # Predicted class labels
y_prob = model.predict_proba(X_test)  # Predicted probabilities for all classes

# F-beta with beta=2 weights recall more heavily than precision.
# average=None yields one F2 score per class, not a single weighted/averaged value.
f2 = fbeta_score(y_test, y_pred, beta=2, average=None)

print(f"F2 Score: {f2}")

In [None]:
# Confusion matrix for the predictions currently held in `y_pred`.
# When cells are run top to bottom this is the logistic-regression baseline;
# the model-comparison loop further down rebinds `y_pred`, so re-running this
# cell after that loop would describe the last model trained there instead.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Candidate classifiers to compare, keyed by the name shown in the results table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # NOTE(review): `use_label_encoder` is reported as deprecated in newer XGBoost
    # releases -- confirm against the installed version before relying on it.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}
results = []

# NOTE: this loop rebinds the notebook-level names `model` and `y_pred`; after it
# finishes they refer to the last entry in `models`, not the earlier baseline.
for name, model in models.items():

    print(f"Training {name}")
    # Fit on the training split
    model.fit(X_train, y_train)

    print("Done")

    # Class predictions on the held-out split
    y_pred = model.predict(X_test)

    # Per-class F2 scores (beta=2, average=None returns one value per class)
    f2_scores = fbeta_score(y_test, y_pred, beta=2, average=None)

    # One record per model; the DataFrame is built once after the loop
    results.append({"Model": name, "F2 Score": f2_scores})

# Collect the per-model records into a single table
results_df = pd.DataFrame(results)

# Display results
results_df