# Data Mining Final Project

Students: Jose Pujol, Jacob Schuster

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### 1. Loading and Inspecting the Data

In [None]:
# Read the raw headlines CSV from the local `dataset` folder; the file is
# decoded as ISO-8859-1 rather than the pandas default.
dataset_path = os.path.join(os.getcwd(), 'dataset', 'Stock News Dataset.csv')
data = pd.read_csv(dataset_path, encoding="ISO-8859-1")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Report the dimensions of the raw frame.
row_count, column_count = data.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

In [None]:
# Show the smallest and largest values of the Date column.
date_column = data['Date']
first_date = date_column.min()
last_date = date_column.max()
print(f"From {first_date} to {last_date}")

This dataset contains the top 25 news headlines for each day, together with a label describing the movement of the Dow Jones Industrial Average (DJIA). The first two columns are the date and the label: the label is 1 if the DJIA adjusted close value rose or stayed the same, and 0 if it decreased. The remaining columns are the top 25 headlines for that day. The data covers January 2003 to July 2016.

### 2. Data Preprocessing

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

We can see that the date is a string and the label is an integer (0 or 1). The rest of the columns are strings containing the headlines.

In [None]:
# Count missing values per column.
data.isna().sum()

We only have 3 rows that have null values. We will drop these rows since it is a small amount of data.

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how="any")

The date column is mostly irrelevant and could cause the model to overfit, so let's drop it.

In [None]:
# Remove the Date column so it is not used as a feature.
data = data.drop(columns=['Date'])

In [None]:
# Re-inspect columns, non-null counts and dtypes after the cleaning steps.
data.info()

In [None]:
# Report the dimensions of the frame after cleaning.
row_count, column_count = data.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

In [None]:
# Count missing values per column again to verify the cleaning step.
data.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Print the share of rows belonging to each Label value, in order of first
# appearance in the frame.
label_column = data['Label']
total_rows = label_column.size
all_classes = label_column.unique()

for class_name in all_classes:
    class_rows = (label_column == class_name).sum()
    print(f"Percentage of {class_name} class in dataset: {class_rows/total_rows*100:0.2f}%")

There is not a wide disparity between the class values.

Because we are dealing with sequential data, we cannot randomly split it into training and testing sets. Instead, we split the data chronologically: the first 72% of the rows are used for training and the last 28% for testing.

### 3. Feature Engineering

In [None]:
from inference_model import inference_model

Convert the textual article headlines to a sentiment score (0-1) using a transformer model

In [None]:
# Replace `data` with the output of the project's inference_model helper.
# NOTE(review): per the notebook text this is expected to turn the headline
# columns into sentiment scores — confirm against inference_model.py.
data = inference_model(data)
data.head()


In [None]:
# Write the current frame to disk (without the index) so it can be reloaded later.
sentiment_csv_path = os.path.join(os.getcwd(), 'dataset', 'Stock News Dataset (with sentiment).csv')
data.to_csv(sentiment_csv_path, index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = data['Label']
X = data.drop(columns=['Label'])

# Chronological hold-out: with shuffle=False the first 72% of the rows form
# the training set and the last 28% the test set, so no later rows are used
# to predict earlier ones.
# NOTE(review): assumes the rows are still in date order (the Date column was
# dropped earlier) — confirm the CSV and inference_model preserve row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.28, shuffle=False)

In [None]:
# Report how many rows ended up in each split and their share of the full frame.
total_rows = data.shape[0]
train_rows = X_train.shape[0]
test_rows = X_test.shape[0]
print(f"Number of rows in train dataset: {train_rows}, {train_rows/total_rows*100:0.2f}%")
print(f"Number of rows in test dataset: {test_rows}, {test_rows/total_rows*100:0.2f}%")

### 4. Model Selection and Evaluation

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneOut
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Empty results table: two descriptive columns followed by the same five
# metrics for the training split and then for the validation split.
descriptive_columns = ["Model Name", "Notes"]
training_columns = [
    "Accuracy (Training)",
    "Log Loss (Training)",
    "F1 Score (Training)",
    "Precision (Training)",
    "Recall (Training)",
]
validation_columns = [
    "Accuracy (Validation)",
    "Log Loss (Validation)",
    "F1 Score (Validation)",
    "Precision (Validation)",
    "Recall (Validation)",
]
models_df = pd.DataFrame(
    columns=[*descriptive_columns, *training_columns, *validation_columns]
)
models_df.head()

In [None]:
def add_model_stats(model, model_name, notes, X_train, y_train, X_val, y_val, models_df):
    """Score a fitted classifier on two splits and record the results.

    Accuracy, log loss and weighted F1 / precision / recall are computed on
    both the training and the validation split. A summary is printed and one
    row holding all metrics is appended to the results table.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        model_name: Label used in the printed report and the "Model Name" column.
        notes: Free-form text stored in the "Notes" column.
        X_train: Features of the training split.
        y_train: True labels of the training split.
        X_val: Features of the validation split.
        y_val: True labels of the validation split.
        models_df: DataFrame holding previously recorded results.

    Returns:
        A new DataFrame made of ``models_df`` plus one row for this model;
        the frame passed in is not modified.
    """
    X_train_predictions = model.predict(X_train)
    X_train_accuracy = accuracy_score(y_train, X_train_predictions)
    print(f"{model_name} validation Accuracy Score on training set: {X_train_accuracy:0.4f}")

    X_val_predictions = model.predict(X_val)
    X_val_accuracy = accuracy_score(y_val, X_val_predictions)
    print(f"{model_name} validation Accuracy Score on validation set: {X_val_accuracy:0.4f}")

    # Log loss is computed from class probabilities, not hard predictions.
    X_train_probabilities = model.predict_proba(X_train)
    X_val_probabilities = model.predict_proba(X_val)

    log_loss_training_set = log_loss(y_train, X_train_probabilities)
    # This value is the training-set loss, so the message must say so.
    print(f"{model_name} validation Log Loss on training set: {log_loss_training_set:0.4f}")

    log_loss_validation_set = log_loss(y_val, X_val_probabilities)
    print(f"{model_name} validation Log Loss on validation set: {log_loss_validation_set:0.4f}")

    # Training-split scores are recorded in the table but not printed.
    model_f1_train = f1_score(y_train, X_train_predictions, average='weighted')
    model_precision_train = precision_score(y_train, X_train_predictions, average="weighted")
    model_recall_train = recall_score(y_train, X_train_predictions, average="weighted")

    model_f1_val = f1_score(y_val, X_val_predictions, average='weighted')
    print(f"{model_name} validation F1 Score on validation set: {model_f1_val:0.4f}")
    model_precision_val = precision_score(y_val, X_val_predictions, average="weighted")
    print(f"{model_name} validation Precision Score on validation set: {model_precision_val:0.4f}")
    model_recall_val = recall_score(y_val, X_val_predictions, average="weighted")
    print(f"{model_name} validation Recall Score on validation set: {model_recall_val:0.4f}")

    df_entry = {
            "Model Name": model_name,
            "Notes": notes,
            "Accuracy (Training)": X_train_accuracy,
            "Log Loss (Training)": log_loss_training_set,
            "F1 Score (Training)": model_f1_train,
            "Precision (Training)": model_precision_train,
            "Recall (Training)": model_recall_train,
            "Accuracy (Validation)": X_val_accuracy,
            "Log Loss (Validation)": log_loss_validation_set,
            "F1 Score (Validation)": model_f1_val,
            "Precision (Validation)": model_precision_val,
            "Recall (Validation)": model_recall_val}

    # DataFrame.append was removed in pandas 2.0, so the new row is added with pd.concat.
    # See this post https://stackoverflow.com/questions/75956209/error-dataframe-object-has-no-attribute-append
    models_df = pd.concat([models_df, pd.DataFrame([df_entry])], ignore_index=True)
    return models_df

In [None]:
# Candidate classifiers, all seeded with random_state=42.
logistic_model = LogisticRegression(max_iter=10000, random_state=42)
svc_model_linear = SVC(kernel="linear", gamma="auto", probability=True, random_state=42)
svc_model_rbf = SVC(kernel="rbf", gamma="auto", probability=True, random_state=42)
random_forest_classifier = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    n_jobs=-1,
    random_state=42,
)
xgb_classifier = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    gamma=0.1,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the logistic-regression model on the training split.
logistic_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features.
predictions = logistic_model.predict(X_test)

In [None]:
# Evaluate the latest predictions against the test labels, then print the
# confusion matrix, the accuracy and the per-class report in that order.
matrix = confusion_matrix(y_test, predictions)
score = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
for result in (matrix, score, report):
    print(result)

In [None]:
# Train on the feature matrix only; the label is passed separately so it
# cannot leak into the model inputs. Predictions are made on the test features.
svc_model_linear.fit(X_train, y_train)
predictions = svc_model_linear.predict(X_test)

In [None]:
# Compare the latest predictions with the true test labels from the split.
matrix = confusion_matrix(y_test, predictions)
print(matrix)
score = accuracy_score(y_test, predictions)
print(score)
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Train on the feature matrix only; the label is passed separately so it
# cannot leak into the model inputs. Predictions are made on the test features.
svc_model_rbf.fit(X_train, y_train)
predictions = svc_model_rbf.predict(X_test)

In [None]:
# Compare the latest predictions with the true test labels from the split.
matrix = confusion_matrix(y_test, predictions)
print(matrix)
score = accuracy_score(y_test, predictions)
print(score)
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Fit the random forest on the training split and immediately predict the
# test split; fit() returns the fitted estimator, so the calls can be chained.
predictions = random_forest_classifier.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluate the latest predictions against the test labels, then print the
# confusion matrix, the accuracy and the per-class report in that order.
matrix = confusion_matrix(y_test, predictions)
score = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
for result in (matrix, score, report):
    print(result)

In [None]:
# Train on the feature matrix only; the label is passed separately so it
# cannot leak into the model inputs. Predictions are made on the test features.
xgb_classifier.fit(X_train, y_train)
predictions = xgb_classifier.predict(X_test)

In [None]:
# Compare the latest predictions with the true test labels from the split.
matrix = confusion_matrix(y_test, predictions)
print(matrix)
score = accuracy_score(y_test, predictions)
print(score)
report = classification_report(y_test, predictions)
print(report)

### 5. Hyperparameter Tuning

### 6. Visualization

### 7. Insights and Analysis

### 8. Validation

### 9. Conclusion