# Import Libraries

In [1]:
# Importing Libraries EDA Libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Libraries for Encoding
from sklearn.preprocessing import LabelEncoder
Encode = LabelEncoder()

# Libraries for Standardization
from sklearn.preprocessing import StandardScaler
Sc = StandardScaler()

# linear regression Libraries
# To calculate multicollinearity and VIF 
import statsmodels.api as sm
from scipy import stats

# for model building
from  sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

# EDA: Cleaning, Preprocessing and Feature Engineering 

In [2]:
# importing dataset
train = pd.read_csv('train_NIR5Yl1.csv')
train.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


In [3]:
train.info()
# no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330045 entries, 0 to 330044
Data columns (total 7 columns):
ID            330045 non-null int64
Tag           330045 non-null object
Reputation    330045 non-null float64
Answers       330045 non-null float64
Username      330045 non-null int64
Views         330045 non-null float64
Upvotes       330045 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 17.6+ MB


In [4]:
train.describe()
# upvotes has huge diviation from mean.

Unnamed: 0,ID,Reputation,Answers,Username,Views,Upvotes
count,330045.0,330045.0,330045.0,330045.0,330045.0,330045.0
mean,235748.682789,7773.147,3.917672,81442.888803,29645.07,337.505358
std,136039.418471,27061.41,3.579515,49215.10073,80956.46,3592.441135
min,1.0,0.0,0.0,0.0,9.0,0.0
25%,117909.0,282.0,2.0,39808.0,2594.0,8.0
50%,235699.0,1236.0,3.0,79010.0,8954.0,28.0
75%,353620.0,5118.0,5.0,122559.0,26870.0,107.0
max,471493.0,1042428.0,76.0,175738.0,5231058.0,615278.0


In [5]:
(train[train['Views']>2800000]).count()
# removing sparsed data

ID            21
Tag           21
Reputation    21
Answers       21
Username      21
Views         21
Upvotes       21
dtype: int64

In [6]:
train = train.drop(train[train.Views > 2800000].index)

In [7]:
train['Tag'] = Encode.fit_transform(train['Tag'])
train.drop(['ID','Username'], axis=1,inplace =True)
target = train['Upvotes']

# Check for Multicolinearity


In [8]:
X = train.drop(['Upvotes'],axis=1)
feat_names = [x for x in X]

In [9]:

for i in range(0,len(feat_names)):
    yy = X.loc[:, X.columns == feat_names[i]]
    xx = X.loc[:, X.columns != feat_names[i]]
    model = sm.OLS(yy, xx)
    results = model.fit()
    rsq = results.rsquared
    vif = round(1 / (1 - rsq), 2)
    print(
        "R Square value of {} column is {} keeping all other columns as features".format(
            feat_names[i], (round(rsq, 2))
        ))
    print(
        "Variance Inflation Factor of {} column is {} \n".format(
            feat_names[i], vif)
    )

R Square value of Tag column is 0.34 keeping all other columns as features
Variance Inflation Factor of Tag column is 1.53 

R Square value of Reputation column is 0.07 keeping all other columns as features
Variance Inflation Factor of Reputation column is 1.07 

R Square value of Answers column is 0.54 keeping all other columns as features
Variance Inflation Factor of Answers column is 2.16 

R Square value of Views column is 0.35 keeping all other columns as features
Variance Inflation Factor of Views column is 1.54 



# Model Building

In [10]:
from sklearn.preprocessing import Binarizer
#Creating a threshold 
bn = Binarizer(threshold=7)
#above 7 is considered as 1 and below 7 we are considering as 0
bn_trans = bn.transform([train['Answers']])[0]
train['bn_trans'] = bn_trans

In [11]:
feature_names = [x for x in train.columns if x not in ['Upvotes']]

x_train, x_test, y_train, y_test = train_test_split(train[feature_names], target,test_size = 0.25,random_state =101)
x_train = Sc.fit_transform(x_train)
x_test = Sc.transform(x_test)

In [12]:
poly_reg = PolynomialFeatures(degree = 5,interaction_only=False, include_bias=True)
X_poly = poly_reg.fit_transform(x_train)
poly_reg.fit(x_train, y_train)
lin_reg = linear_model.LassoLars(alpha=0.021,max_iter=151)
lin_reg.fit(X_poly, y_train)

LassoLars(alpha=0.021, copy_X=True, eps=2.220446049250313e-16,
          fit_intercept=True, fit_path=True, max_iter=151, normalize=True,
          positive=False, precompute='auto', verbose=False)

In [13]:
# predicitng 
preds = lin_reg.predict(poly_reg.fit_transform(x_test))

print(r2_score(y_test, preds))

0.9087772429959889


In [14]:
from sklearn import metrics

print('MAE:', metrics.mean_absolute_error(y_test, preds))

MAE: 107.81798769042817


# Applying model to testing

In [15]:
test = pd.read_csv('testupvotes.csv')
ids = test['ID']
test.drop(['ID','Username'], axis=1,inplace =True)
test['Tag'] = Encode.fit_transform(test['Tag'])

In [16]:
bn = Binarizer(threshold=7)
bn_trans = bn.transform([test['Answers']])[0]
test['bn_trans'] = bn_trans

In [17]:
test = Sc.fit_transform(test)
pred_test = lin_reg.predict(poly_reg.fit_transform(test))
pred_test=abs(pred_test)
pred_test=pred_test.astype(int)

In [18]:
Final_submission = pd.DataFrame({'ID': ids,
                           'Upvotes':pred_test
                           })

In [19]:
Final_submission.head(100)

Unnamed: 0,ID,Upvotes
0,366953,206
1,71864,121
2,141692,46
3,316833,35
4,440445,273
5,3514,34
6,331420,15
7,406540,84
8,227379,70
9,331687,28


In [22]:
Final_submission.to_csv("Final_Sub.csv",index=False)