<a href="https://colab.research.google.com/github/IemProg/DataChallengeXINF554/blob/main/Linear_SVR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import os, sys
from collections import Counter

In [3]:
!pip install verstack

Collecting verstack
  Downloading https://files.pythonhosted.org/packages/bc/7e/6319afad955211755557db0f30c31a6eddd6cefcc795fec1c27bbb1b5e31/verstack-0.3.1.tar.gz
Building wheels for collected packages: verstack
  Building wheel for verstack (setup.py) ... [?25l[?25hdone
  Created wheel for verstack: filename=verstack-0.3.1-cp36-none-any.whl size=14343 sha256=e74b475c593c8cabae38ec650117958631187a6b84812d245f0ddc6bae94f315
  Stored in directory: /root/.cache/pip/wheels/15/1b/58/10e59516150cea4d9b1dbacceb3bffcc0cfd2d166efabec2f6
Successfully built verstack
Installing collected packages: verstack
Successfully installed verstack-0.3.1


In [4]:
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit 
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [11]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [12]:
path = "/content/drive/MyDrive/DataSets/"
path_data_train = path + "train.csv"

In [13]:
train_data = pd.read_csv(path_data_train)

In [14]:
#Non-relevant features, can not be used for SVMs models
train_data.drop('timestamp', axis=1, inplace=True)
train_data.drop('user_mentions', axis=1, inplace=True)
train_data.drop('urls', axis=1, inplace=True)
train_data.drop('hashtags', axis=1, inplace=True)
train_data.drop('text', axis=1, inplace=True)
train_data.drop('id', axis=1, inplace=True)
train_data.drop('user_verified', axis=1, inplace=True)

In [15]:
nbr = train_data[train_data.retweet_count != 0].shape[0]
nbr1 = train_data[train_data.retweet_count == 0].shape[0]
print("\t Number of rows where retweets != 0: ", nbr)
print("\t Number of rows where retweets == 0: ", nbr1)

	 Number of rows where retweets != 0:  242974
	 Number of rows where retweets == 0:  422803


In [16]:
# Shuffle the Dataset.
shuffled_train = train_data.sample(frac=1, random_state=4)

# Put all the samples where they dont have zeros re-tweet in a separate dataset.
non_zero_retweet = shuffled_train.loc[shuffled_train['retweet_count'] != 0]
nbr_samples = non_zero_retweet.shape[0] // 1  #We will take only sixth of it

#Randomly select samples observations from the zero re-tweet (majority class)
zero_retweet = shuffled_train.loc[shuffled_train['retweet_count'] == 0].sample(n=nbr_samples, random_state=42)

print("Shape of non_zero_retweet: ", non_zero_retweet[:nbr_samples].shape)
print("Shape of zero_retweet: ", zero_retweet.shape)

# Concatenate both dataframes again
normalized_train = pd.concat([non_zero_retweet[:nbr_samples], zero_retweet])
print("Normalized train dataset: ", normalized_train.shape)

Shape of non_zero_retweet:  (242974, 4)
Shape of zero_retweet:  (242974, 4)
Normalized train dataset:  (485948, 4)


In [17]:
nbr = normalized_train[normalized_train.retweet_count != 0].shape[0]
nbr1 = normalized_train[normalized_train.retweet_count == 0].shape[0]
print("\t Number of rows in normalized_train where retweets != 0: ", nbr)
print("\t Number of rows in normalized_train where retweets == 0: ", nbr1)

	 Number of rows in normalized_train where retweets != 0:  242974
	 Number of rows in normalized_train where retweets == 0:  242974


In [18]:
normalized_train.columns

Index(['retweet_count', 'user_statuses_count', 'user_followers_count',
       'user_friends_count'],
      dtype='object')

In [19]:
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(normalized_train)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(normalized_train, normalized_train["retweet_count"],
                                                    test_size=0.3, random_state=85)
# We remove the actual number of retweets from our features since it is the value that we are trying to predict
X_train = X_train.drop(['retweet_count'], axis=1)
X_test = X_test.drop(['retweet_count'], axis=1)

In [21]:
X_train.columns
X_test.columns

Index(['user_statuses_count', 'user_followers_count', 'user_friends_count'], dtype='object')

In [22]:
print("\t Train dataset shape: ", X_train.shape)
print("\t Test dataset shape: ", X_test.shape)

	 Train dataset shape:  (340163, 3)
	 Test dataset shape:  (145785, 3)


**Comment:**
> - Storing the kernel matrix requires memory that scales quadratically with the number of data points. Training time for traditional SVM algorithms also scales superlinearly with the number of data points. So, these algorithms aren't feasible for large data sets.
LinearSVR is Similar to SVR with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.

In [23]:
linear_svr = LinearSVR()

In [24]:
linear_svr.fit(X_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [25]:
linear_svr_score = linear_svr.score(X_train, y_train)
print("\t Train accuracy: ", linear_svr_score)

	 Train accuracy:  -0.2287145674973965


In [26]:
predict = linear_svr.predict(X_test)
print("\t Test Linear SVM accuracy: ", predict)
print("\t Prediction error using MAE: ", mean_absolute_error(y_true=y_test, y_pred=predict))

	 Test Linear SVM accuracy:  [-0.92539639 -2.19845968 -1.32723158 ...  0.0596802  -3.76149261
 -0.75225277]
	 Prediction error using MAE:  352.4568078714684


In [27]:
path_data_eval = path + "/evaluation.csv"
evaluation = pd.read_csv(path_data_eval)

In [28]:
#Non-relevant features, can not be used for LR models
evaluation.drop('timestamp', axis=1, inplace=True)
evaluation.drop('user_mentions', axis=1, inplace=True)
evaluation.drop('urls', axis=1, inplace=True)
evaluation.drop('hashtags', axis=1, inplace=True)
evaluation.drop('text', axis=1, inplace=True)
evaluation.drop('user_verified', axis=1, inplace=True)

In [29]:
withoutID = evaluation.copy()
withoutID.drop('id', axis=1, inplace=True)

In [30]:
min_max_scaler = preprocessing.MinMaxScaler()
withoutID_minmax = min_max_scaler.fit_transform(withoutID)

In [31]:
kaggleOut  =  linear_svr.predict(withoutID_minmax)

In [32]:
import csv

f = open("linearsvr.csv", "w+")
with open("linearsvr.csv", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(kaggleOut):
        print(str(evaluation['id'].iloc[index]) + " ," + str(int(prediction)))
        writer.writerow([str(evaluation['id'].iloc[index]) , str(int(prediction))])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
946110 ,0
946111 ,0
946112 ,0
946113 ,0
946114 ,0
946115 ,0
946116 ,0
946117 ,0
946118 ,0
946119 ,0
946120 ,0
946121 ,0
946122 ,0
946123 ,0
946124 ,0
946125 ,0
946126 ,0
946127 ,0
946128 ,0
946129 ,0
946130 ,0
946131 ,0
946132 ,0
946133 ,0
946134 ,0
946135 ,0
946136 ,0
946137 ,0
946138 ,0
946139 ,0
946140 ,0
946141 ,0
946142 ,0
946143 ,0
946144 ,0
946145 ,0
946146 ,0
946147 ,0
946148 ,0
946149 ,0
946150 ,0
946151 ,0
946152 ,0
946153 ,0
946154 ,0
946155 ,0
946156 ,0
946157 ,0
946158 ,0
946159 ,0
946160 ,0
946161 ,0
946162 ,0
946163 ,0
946164 ,0
946165 ,0
946166 ,0
946167 ,0
946168 ,0
946169 ,0
946170 ,0
946171 ,0
946172 ,0
946173 ,0
946174 ,0
946175 ,0
946176 ,0
946177 ,0
946178 ,0
946179 ,0
946180 ,0
946181 ,0
946182 ,0
946183 ,0
946184 ,0
946185 ,0
946186 ,0
946187 ,0
946188 ,0
946189 ,0
946190 ,0
946191 ,0
946192 ,0
946193 ,0
946194 ,0
946195 ,0
946196 ,0
946197 ,0
946198 ,0
946199 ,0
946200 ,0
946201 ,0
946202 ,0
94620