In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import os, sys
from collections import Counter

In [4]:
!pip install verstack



In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit 

In [6]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [8]:
path = "/content/drive/My Drive/"
path_data_train = path + "/covidChallenge/data/train.csv"

In [9]:
train_data = pd.read_csv(path_data_train)

In [10]:
from sklearn.svm import SVC
from sklearn import metrics

In [11]:
#Non-relevant features, can not be used for SVMs models
train_data.drop('timestamp', axis=1, inplace=True)
train_data.drop('user_mentions', axis=1, inplace=True)
train_data.drop('urls', axis=1, inplace=True)
train_data.drop('hashtags', axis=1, inplace=True)
train_data.drop('text', axis=1, inplace=True)
train_data.drop('id', axis=1, inplace=True)
train_data.drop('user_verified', axis=1, inplace=True)

In [18]:
nbr = train_data[train_data.retweet_count != 0].shape[0]
nbr1 = train_data[train_data.retweet_count == 0].shape[0]
print("\t Number of rows where retweets != 0: ", nbr)
print("\t Number of rows where retweets == 0: ", nbr1)

	 Number of retweets != 0:  242974
	 Number of retweets == 0:  422803


In [46]:
# Shuffle the Dataset.
shuffled_train = train_data.sample(frac=1, random_state=4)

# Put all the samples where they dont have zeros re-tweet in a separate dataset.
non_zero_retweet = shuffled_train.loc[shuffled_train['retweet_count'] != 0]
nbr_samples = non_zero_retweet.shape[0] // 10  #We will take only sixth of it

#Randomly select samples observations from the zero re-tweet (majority class)
zero_retweet = shuffled_train.loc[shuffled_train['retweet_count'] == 0].sample(n=nbr_samples, random_state=42)

print("Shape of non_zero_retweet: ", non_zero_retweet[:nbr_samples].shape)
print("Shape of zero_retweet: ", zero_retweet.shape)

# Concatenate both dataframes again
normalized_train = pd.concat([non_zero_retweet[:nbr_samples], zero_retweet])
print("Normalized train dataset: ", normalized_train.shape)

Shape of non_zero_retweet:  (24297, 4)
Shape of zero_retweet:  (24297, 4)
Normalized train dataset:  (48594, 4)


In [47]:
nbr = normalized_train[normalized_train.retweet_count != 0].shape[0]
nbr1 = normalized_train[normalized_train.retweet_count == 0].shape[0]
print("\t Number of rows in normalized_train where retweets != 0: ", nbr)
print("\t Number of rows in normalized_train where retweets == 0: ", nbr1)

	 Number of rows in normalized_train where retweets != 0:  24297
	 Number of rows in normalized_train where retweets == 0:  24297


In [48]:
normalized_train.columns

Index(['retweet_count', 'user_statuses_count', 'user_followers_count',
       'user_friends_count'],
      dtype='object')

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(normalized_train, normalized_train["retweet_count"],
                                                    test_size=0.3, random_state=85)
# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(['retweet_count'], axis=1)
X_test = X_test.drop(['retweet_count'], axis=1)

In [50]:
X_train.columns
X_test.columns

Index(['user_statuses_count', 'user_followers_count', 'user_friends_count'], dtype='object')

In [51]:
print("\t Train dataset shape: ", X_train.shape)
print("\t Test dataset shape: ", X_test.shape)

	 Train dataset shape:  (34015, 3)
	 Test dataset shape:  (14579, 3)


**Comment:**
> - Storing the kernel matrix requires memory that scales quadratically with the number of data points. Training time for traditional SVM algorithms also scales superlinearly with the number of data points. So, these algorithms aren't feasible for large data sets.

In [None]:
linear_svm = SVC(kernel = 'linear', probability=False)
linear_svm.fit(X_train, y_train)

In [1]:
linear_svm_score = linear_svm.score(x_train_small, y_train_small)
print("\t Train accuracy: ", linear_svm_score)

NameError: ignored

In [2]:
predict = model.predict(X_test)
print("\t Test Linear SVM accuracy: ", predict)
print("\t Prediction error using MAE: ", mean_absolute_error(y_true=y_test, y_pred=predict))

NameError: ignored

--------------------------------------------------------------------------------

Polynomial Support Vector Machines

In [None]:
train_accuracy = []
test_accuracy = []
roc_auc = []
f1_score = []
pr_auc = []
for i in range(2,8):
    model = SVC(kernel = 'poly', probability=False, degree= i)
    model.fit(x_train_small, y_train_small)
    train_accuracy.append(model.score(x_train_small, y_train_small))

    predicted_label = model.predict(x_test_small)

    test_accuracy.append(metrics.accuracy_score(y_test_small, predicted_label)*100)
    #roc_auc.append(metrics.roc_auc_score(y_test_small, predicted_proba))
    f1_score.append(metrics.f1_score(y_test_small, predicted_label))