In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import shuffle
from sklearn.exceptions import NotFittedError
import joblib




In [2]:
df = pd.read_csv("doc_vectorsLabeled.csv")

## Extracting the Labeled Data

In [3]:
df_labeled = df.head(2001)
X = df_labeled.iloc[:, 1:-1]  # Features
y = df_labeled['support']  # Labels

In [7]:
df_labeled.tail

<bound method NDFrame.tail of       Unnamed: 0     dim_0     dim_1     dim_2     dim_3     dim_4     dim_5  \
0              0 -0.057990  0.055985  0.002570  0.025762  0.052390 -0.010970   
1              1 -0.034268 -0.048958 -0.065448 -0.067160 -0.062757  0.045409   
2              2 -0.089613  0.084065  0.011270  0.016617  0.072491 -0.026219   
3              3 -0.030738  0.047998 -0.013283  0.021846  0.031084 -0.038226   
4              4 -0.175887  0.176201  0.020011  0.047555  0.122551 -0.040481   
...          ...       ...       ...       ...       ...       ...       ...   
1996        1996  0.041768 -0.018960  0.010593 -0.012302  0.003654 -0.002383   
1997        1997 -0.002870 -0.009623  0.014463  0.001875 -0.010985 -0.013238   
1998        1998 -0.009619 -0.000421  0.000033  0.006329  0.006559 -0.003828   
1999        1999  0.075217 -0.073033 -0.014893 -0.017576 -0.016714 -0.001446   
2000        2000  0.001953 -0.001311 -0.017945  0.004201 -0.012188  0.012024   

         

## Comparing Models

* Random Forest
* SVM
* NeuralNetworkModel
* KNN
* Gaussian Naive Bayes

In [12]:
class ModelEvaluator:
    def __init__(self, model, X, y, test_size=0.2, random_state=42):
        try:
            self.model = model
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
            self.X_train, self.y_train = shuffle(self.X_train, self.y_train)
        except Exception as e:
            print(f"Error during initialization: {e}")
            # Handle the exception as needed, e.g., logging, raising a specific exception, etc.
        finally:
            # Code that should always run after try or except
            pass

    def train_evaluate(self):
        try:
            self.model.fit(self.X_train, self.y_train)
            y_pred = self.model.predict(self.X_test)
            accuracy = accuracy_score(self.y_test, y_pred)
            return accuracy
        except Exception as e:
            print(f"Error during training and evaluation: {e}")
            return None 

        
class RandomForestModel:
    def __init__(self, n_estimators=100, random_state=42):
        self.model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

class SVMModel:
    def __init__(self):
        self.model = SVC()

class KNNModel:
    def __init__(self, n_neighbors=20):
        self.model = KNeighborsClassifier(n_neighbors=n_neighbors)

class NeuralNetworkModel:
    def __init__(self):
        self.model = Sequential()

    def build_model(self, input_dim):
        self.model.add(Dense(units=128, activation='relu', input_dim=input_dim))
        self.model.add(BatchNormalization())
        self.model.add(Dropout(0.3))
        self.model.add(Dense(units=64, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(units=32, activation='relu'))
        self.model.add(Dropout(0.3))
        self.model.add(Dense(units=1, activation='sigmoid'))
        self.model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        return self.model

    def train_evaluate(self, X_train, y_train, X_test, y_test, epochs=10, batch_size=64, validation_split=0.3):
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split, callbacks=[early_stopping])
        y_pred_proba = self.model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int)
        y_pred = y_pred.flatten()
        accuracy = accuracy_score(y_test, y_pred)
        return accuracy

class GaussianNBModel:
    def __init__(self):
        self.model = GaussianNB() 

In [14]:
X = np.require(X, requirements=['C'])
# Random Forest
rf_model = RandomForestModel()
rf_evaluator = ModelEvaluator(rf_model.model, X, y)
rf_accuracy = rf_evaluator.train_evaluate()
print(f'Random Forest Accuracy: {rf_accuracy}')

# SVM
svm_model = SVMModel()
svm_evaluator = ModelEvaluator(svm_model.model, X, y)
svm_accuracy = svm_evaluator.train_evaluate()
print(f'SVM Accuracy: {svm_accuracy}')

# KNN
knn_model = KNNModel()
# Split the labeled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN model
model = KNeighborsClassifier(n_neighbors=5)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
knn_evaluator = ModelEvaluator(knn_model.model, X, y)
knn_accuracy = knn_evaluator.train_evaluate()
print(f'KNN Accuracy: {knn_accuracy}')

# Neural Network
nn_model = NeuralNetworkModel()
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train_nn)
X_test_nn = scaler.transform(X_test_nn)
nn_model.build_model(X_train_nn.shape[1])
nn_accuracy = nn_model.train_evaluate(X_train_nn, y_train_nn, X_test_nn, y_test_nn)
print(f'Neural Network Accuracy: {nn_accuracy}')

# Gaussian Naive Bayes
gnb_model = GaussianNBModel()
gnb_evaluator = ModelEvaluator(gnb_model.model, X, y)
gnb_accuracy = gnb_evaluator.train_evaluate()
print(f'Gaussian Naive Bayes Accuracy: {gnb_accuracy}')


Random Forest Accuracy: 0.5860349127182045
SVM Accuracy: 0.5835411471321695
Accuracy: 0.4763092269326683
KNN Accuracy: 0.5785536159600998
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.4841930116472546
Gaussian Naive Bayes Accuracy: 0.5685785536159601


### Using Random Forest to predict 

In [18]:
model_filename = 'random_forest_modelTweets.joblib'
joblib.dump(rf_model.model, model_filename)
print(f'Trained Random Forest model saved to {model_filename}')

Trained Random Forest model saved to random_forest_modelTweets.joblib


In [23]:
df_unlabeled = df.iloc[2001:, :]

In [29]:
df_unlabeled

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_90,dim_91,dim_92,dim_93,dim_94,dim_95,dim_96,dim_97,dim_98,dim_99
2001,-0.011409,0.017817,0.000373,0.002293,0.020791,-0.007415,-0.003678,0.004083,0.003181,-0.015718,...,-0.002850,-0.007854,0.012894,0.003590,0.021470,0.015965,0.005742,-0.031620,-0.007543,0.011795
2002,-0.048148,0.050953,0.016933,0.011151,0.047589,-0.018298,-0.031907,0.022250,-0.020716,-0.055782,...,-0.023346,0.005062,0.002659,0.027743,0.063972,0.068428,0.019125,-0.081859,-0.010699,0.003827
2003,0.005060,-0.007687,0.011518,-0.001099,0.006345,0.001604,0.005980,-0.012611,0.009210,-0.004440,...,0.003746,-0.002790,-0.013111,0.005626,0.002465,-0.012269,0.001375,-0.000010,-0.017494,-0.003267
2004,0.038747,-0.042332,0.011826,-0.006115,0.001726,0.016052,0.011315,-0.016276,0.007469,-0.005949,...,0.019898,0.026572,-0.001119,-0.017220,-0.000459,-0.019872,0.002720,0.019633,0.009849,-0.007121
2005,-0.026658,0.032831,0.002939,-0.001465,0.021367,0.006096,-0.034557,0.010107,-0.006545,-0.000379,...,-0.014690,-0.005522,0.010513,0.015372,0.030871,0.059793,0.006668,-0.057677,-0.004412,-0.007501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10479,-0.007406,0.012106,0.015260,0.013355,-0.004694,0.016298,-0.013631,0.002034,-0.016105,0.009280,...,-0.016776,-0.009949,0.001809,0.009280,0.003805,0.009439,0.005481,0.002766,0.005354,-0.003273
10480,-0.019942,0.032489,0.007699,0.008739,0.022398,-0.009767,-0.009472,0.022057,-0.022520,-0.014389,...,-0.016480,-0.001691,0.004511,0.020923,0.042457,0.055520,0.010976,-0.043073,-0.003421,0.009873
10481,-0.022353,0.001433,0.003119,-0.000876,0.005436,-0.004996,-0.009791,0.008593,-0.013453,-0.002464,...,0.002410,0.013088,-0.000786,0.006808,0.010075,0.019256,-0.005208,-0.027546,-0.008511,-0.005886
10482,-0.071252,0.088967,0.016232,0.023419,0.073283,-0.030827,-0.048934,0.064556,-0.028185,-0.081514,...,-0.013867,0.000046,0.027804,0.059626,0.108564,0.124138,0.024654,-0.147103,-0.017194,0.012069


In [30]:
# RANDON FOREST to predict labels for the unlabeled data
predicted_labels = rf_model.model.predict(df_unlabeled)
df_unlabeled['predicted_support'] = predicted_labels




In [31]:
df_unlabeled

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_91,dim_92,dim_93,dim_94,dim_95,dim_96,dim_97,dim_98,dim_99,predicted_support
2001,-0.011409,0.017817,0.000373,0.002293,0.020791,-0.007415,-0.003678,0.004083,0.003181,-0.015718,...,-0.007854,0.012894,0.003590,0.021470,0.015965,0.005742,-0.031620,-0.007543,0.011795,1.0
2002,-0.048148,0.050953,0.016933,0.011151,0.047589,-0.018298,-0.031907,0.022250,-0.020716,-0.055782,...,0.005062,0.002659,0.027743,0.063972,0.068428,0.019125,-0.081859,-0.010699,0.003827,1.0
2003,0.005060,-0.007687,0.011518,-0.001099,0.006345,0.001604,0.005980,-0.012611,0.009210,-0.004440,...,-0.002790,-0.013111,0.005626,0.002465,-0.012269,0.001375,-0.000010,-0.017494,-0.003267,1.0
2004,0.038747,-0.042332,0.011826,-0.006115,0.001726,0.016052,0.011315,-0.016276,0.007469,-0.005949,...,0.026572,-0.001119,-0.017220,-0.000459,-0.019872,0.002720,0.019633,0.009849,-0.007121,1.0
2005,-0.026658,0.032831,0.002939,-0.001465,0.021367,0.006096,-0.034557,0.010107,-0.006545,-0.000379,...,-0.005522,0.010513,0.015372,0.030871,0.059793,0.006668,-0.057677,-0.004412,-0.007501,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10479,-0.007406,0.012106,0.015260,0.013355,-0.004694,0.016298,-0.013631,0.002034,-0.016105,0.009280,...,-0.009949,0.001809,0.009280,0.003805,0.009439,0.005481,0.002766,0.005354,-0.003273,1.0
10480,-0.019942,0.032489,0.007699,0.008739,0.022398,-0.009767,-0.009472,0.022057,-0.022520,-0.014389,...,-0.001691,0.004511,0.020923,0.042457,0.055520,0.010976,-0.043073,-0.003421,0.009873,1.0
10481,-0.022353,0.001433,0.003119,-0.000876,0.005436,-0.004996,-0.009791,0.008593,-0.013453,-0.002464,...,0.013088,-0.000786,0.006808,0.010075,0.019256,-0.005208,-0.027546,-0.008511,-0.005886,1.0
10482,-0.071252,0.088967,0.016232,0.023419,0.073283,-0.030827,-0.048934,0.064556,-0.028185,-0.081514,...,0.000046,0.027804,0.059626,0.108564,0.124138,0.024654,-0.147103,-0.017194,0.012069,1.0


In [32]:
df_unlabeled.to_csv("Predictions.csv")