<a href="https://colab.research.google.com/github/IemProg/DataChallengeXINF554/blob/main/SVR_RBFKernel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import os, sys
from collections import Counter

In [3]:
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit 
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [4]:
# Colab-only: attach the user's Google Drive under /content/drive so the CSV
# files referenced in the following cells can be read like local files.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Dataset directory on the mounted Drive (kept with its trailing slash) and the
# full location of the training CSV inside it.
path = "/content/drive/MyDrive/DataSets/"
path_data_train = os.path.join(path, "train.csv")

In [7]:
# Load the raw training set into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer=path_data_train)

In [9]:
# Columns that are not used as inputs to the SVR model in this notebook.
NON_FEATURE_COLUMNS = [
    'timestamp',
    'user_mentions',
    'urls',
    'hashtags',
    'text',
    'id',
    'user_verified',
]

# Drop them in a single, non-mutating call. errors="ignore" keeps the cell
# re-runnable: executing it a second time would otherwise raise KeyError
# because the columns are already gone.
train_data = train_data.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")

In [10]:
# Count the rows with and without retweets to see how imbalanced the target is.
has_retweets = train_data.retweet_count != 0
nbr = train_data[has_retweets].shape[0]
nbr1 = train_data[~has_retweets].shape[0]
for label, row_count in (
    ("\t Number of rows where retweets != 0: ", nbr),
    ("\t Number of rows where retweets == 0: ", nbr1),
):
    print(label, row_count)

	 Number of rows where retweets != 0:  242974
	 Number of rows where retweets == 0:  422803


In [11]:
# Shuffle the Dataset.
shuffled_train = train_data.sample(frac=1, random_state=4)

# Put all the samples where they dont have zeros re-tweet in a separate dataset.
non_zero_retweet = shuffled_train.loc[shuffled_train['retweet_count'] != 0]
nbr_samples = non_zero_retweet.shape[0] // 1  #We will take only sixth of it

#Randomly select samples observations from the zero re-tweet (majority class)
zero_retweet = shuffled_train.loc[shuffled_train['retweet_count'] == 0].sample(n=nbr_samples, random_state=42)

print("Shape of non_zero_retweet: ", non_zero_retweet[:nbr_samples].shape)
print("Shape of zero_retweet: ", zero_retweet.shape)

# Concatenate both dataframes again
normalized_train = pd.concat([non_zero_retweet[:nbr_samples], zero_retweet])
print("Normalized train dataset: ", normalized_train.shape)

Shape of non_zero_retweet:  (242974, 4)
Shape of zero_retweet:  (242974, 4)
Normalized train dataset:  (485948, 4)


In [12]:
# Re-count both groups after balancing to confirm they now have equal size.
balanced_has_retweets = normalized_train.retweet_count != 0
nbr = normalized_train[balanced_has_retweets].shape[0]
nbr1 = normalized_train[~balanced_has_retweets].shape[0]
for label, row_count in (
    ("\t Number of rows in normalized_train where retweets != 0: ", nbr),
    ("\t Number of rows in normalized_train where retweets == 0: ", nbr1),
):
    print(label, row_count)

	 Number of rows in normalized_train where retweets != 0:  242974
	 Number of rows in normalized_train where retweets == 0:  242974


In [13]:
# Display the columns that remain after the drops: the target (retweet_count)
# plus the user-level count features.
normalized_train.columns

Index(['retweet_count', 'user_statuses_count', 'user_followers_count',
       'user_friends_count'],
      dtype='object')

In [14]:
# Min-max scale the predictor columns only. The target (retweet_count) is left
# out: it is the value being predicted, not a feature, and must not be rescaled
# together with the inputs.
feature_columns = normalized_train.columns.drop('retweet_count')
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(normalized_train[feature_columns])
# NOTE(review): the scaler is fit before the train/test split, and the SVR in
# this notebook is fit on the unscaled X_train rather than on X_train_minmax.

In [15]:
# The retweet count is the value being predicted, so separate it from the
# predictors first and then split both with the same 70/30 seeded partition.
features = normalized_train.drop(columns=['retweet_count'])
target = normalized_train["retweet_count"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=85
)

In [16]:
# Only the last expression of a cell is displayed, so a bare `X_train.columns`
# before it would be discarded. Check explicitly that both splits expose the
# same feature columns in the same order, then display them once.
assert X_train.columns.equals(X_test.columns), "train/test feature columns differ"
X_test.columns

Index(['user_statuses_count', 'user_followers_count', 'user_friends_count'], dtype='object')

In [17]:
# Report the size of each split (rows, feature columns).
for label, frame in (
    ("\t Train dataset shape: ", X_train),
    ("\t Test dataset shape: ", X_test),
):
    print(label, frame.shape)

	 Train dataset shape:  (340163, 3)
	 Test dataset shape:  (145785, 3)


**Comment:**
> - Storing the kernel matrix requires memory that scales quadratically with the number of data points. Training time for traditional SVM algorithms also scales superlinearly with the number of data points. So, these algorithms aren't feasible for large data sets.

In [18]:
# Support-vector regressor with an RBF kernel; every other hyper-parameter is
# left at its scikit-learn default.
svr = SVR(kernel="rbf")

In [None]:
# Fit the regressor on the training split.
svr.fit(X=X_train, y=y_train)

In [None]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so label the output accordingly.
svr_score = svr.score(X_train, y_train)
print("\t Train R^2 score: ", svr_score)

In [None]:
# Predict on the held-out split and report regression metrics. Printing the raw
# prediction array under an "accuracy" label was misleading: report the R^2
# score of the RBF SVR instead, followed by the mean absolute error.
predict = svr.predict(X_test)
print("\t Test RBF SVR R^2 score: ", metrics.r2_score(y_true=y_test, y_pred=predict))
print("\t Prediction error using MAE: ", mean_absolute_error(y_true=y_test, y_pred=predict))

In [None]:
# Build the evaluation CSV path with os.path.join: `path` already ends with a
# slash, so concatenating "/evaluation.csv" produced a double slash.
path_data_eval = os.path.join(path, "evaluation.csv")
evaluation = pd.read_csv(path_data_eval)

In [None]:
# Columns that are not used as inputs to the SVR model. 'id' is kept on purpose:
# it is needed to label the rows of the submission file.
EVAL_NON_FEATURE_COLUMNS = [
    'timestamp',
    'user_mentions',
    'urls',
    'hashtags',
    'text',
    'user_verified',
]

# Single non-mutating drop; errors="ignore" keeps the cell re-runnable instead
# of raising KeyError when the columns have already been removed.
evaluation = evaluation.drop(columns=EVAL_NON_FEATURE_COLUMNS, errors="ignore")

In [None]:
# Feature-only view of the evaluation set: a new frame without the tweet id,
# leaving `evaluation` itself untouched so the ids stay available.
withoutID = evaluation.drop(columns=['id'])

In [None]:
# Min-max scale the evaluation features with a freshly fitted scaler.
# NOTE(review): this scaler is fit on the evaluation rows themselves, so its
# ranges are not those of the training data, and `svr` is fit on the unscaled
# X_train -- confirm whether scaled inputs are really intended.
min_max_scaler = MinMaxScaler()
withoutID_minmax = min_max_scaler.fit(withoutID).transform(withoutID)

In [None]:
# `svr` is fit on the unscaled X_train, so the evaluation rows must be passed in
# the same raw feature space. Feeding min-max-scaled values to a model trained
# on raw counts makes the predictions meaningless. Selecting X_train.columns
# also guarantees the same feature order as during training.
kaggleOut = svr.predict(withoutID[X_train.columns])

In [None]:
import csv

# Write the submission file: one row per evaluation tweet, with the predicted
# retweet count truncated to an integer.
# - A single `with` block owns the file; the extra bare open() left a second,
#   never-closed handle on the same file.
# - newline="" is what the csv module requires to avoid blank lines on
#   platforms that translate line endings.
rows_written = 0
with open("svr_rbf.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for tweet_id, prediction in zip(evaluation['id'], kaggleOut):
        writer.writerow([str(tweet_id), str(int(prediction))])
        rows_written += 1

# One summary line instead of echoing every row, which flooded the cell output.
print("Rows written to svr_rbf.csv:", rows_written)