In [1]:
import numpy as np
import pandas as pd
import nltk
import sklearn

In [2]:
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [3]:
df.nunique()

id                2834
url_legal          667
license             15
excerpt           2834
target            2834
standard_error    2834
dtype: int64

In [4]:
df.isnull().sum()

id                   0
url_legal         2004
license           2004
excerpt              0
target               0
standard_error       0
dtype: int64

In [5]:
df = df[['excerpt','target']]

##### remove punctuations

In [6]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [8]:
df['clean_excerpt'] = df['excerpt'].apply(lambda x: remove_punct(x))

##### Tokenizer

In [9]:
import re

def tokenize(text):
    tokens = re.split("\W",text)
    return tokens


In [10]:
df['excerpt_tokens'] = df['clean_excerpt'].apply(lambda x : tokenize(x.lower()))
df.head()

Unnamed: 0,excerpt,target,clean_excerpt,excerpt_tokens
0,When the young people returned to the ballroom...,-0.340259,When the young people returned to the ballroom...,"[when, the, young, people, returned, to, the, ..."
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,All through dinner time Mrs Fayre was somewhat...,"[all, through, dinner, time, mrs, fayre, was, ..."
2,"As Roger had predicted, the snow departed as q...",-0.580118,As Roger had predicted the snow departed as qu...,"[as, roger, had, predicted, the, snow, departe..."
3,And outside before the palace a great garden w...,-1.054013,And outside before the palace a great garden w...,"[and, outside, before, the, palace, a, great, ..."
4,Once upon a time there were Three Bears who li...,0.247197,Once upon a time there were Three Bears who li...,"[once, upon, a, time, there, were, three, bear..."


##### Remove Stopwords

In [11]:
import requests
stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
stopwords = set(stopwords_list.decode().splitlines()) 

In [12]:
def remove_stopwords(tokens):
    no_stopwords = [word for word in tokens if word not in stopwords]
    return no_stopwords

In [13]:
df['without_stopwords'] = df['excerpt_tokens'].apply(lambda x:remove_stopwords(x))
df.head()

Unnamed: 0,excerpt,target,clean_excerpt,excerpt_tokens,without_stopwords
0,When the young people returned to the ballroom...,-0.340259,When the young people returned to the ballroom...,"[when, the, young, people, returned, to, the, ...","[young, people, returned, ballroom, presented,..."
1,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,All through dinner time Mrs Fayre was somewhat...,"[all, through, dinner, time, mrs, fayre, was, ...","[dinner, time, fayre, silent, eyes, resting, d..."
2,"As Roger had predicted, the snow departed as q...",-0.580118,As Roger had predicted the snow departed as qu...,"[as, roger, had, predicted, the, snow, departe...","[roger, predicted, snow, departed, days, sleig..."
3,And outside before the palace a great garden w...,-1.054013,And outside before the palace a great garden w...,"[and, outside, before, the, palace, a, great, ...","[palace, great, garden, walled, round, filled,..."
4,Once upon a time there were Three Bears who li...,0.247197,Once upon a time there were Three Bears who li...,"[once, upon, a, time, there, were, three, bear...","[time, bears, lived, house, wood, small, wee, ..."


##### Vectorize

In [14]:
def clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation ])
    tokens = re.split("\W",text)
    no_stopwords = [word for word in tokens if word not in stopwords]
    return no_stopwords

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['excerpt'])

In [16]:
X_features = pd.DataFrame(X_tfidf.toarray())

In [17]:
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28470,28471,28472,28473,28474,28475,28476,28477,28478,28479
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Model Building

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as mse

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X_features,df['target'],test_size=0.3,random_state=20)

In [20]:
lr = LinearRegression()
ridge = Ridge(fit_intercept= True, normalize=False)
rf = RandomForestRegressor(n_estimators = 60)
knn = KNeighborsRegressor()

m = [ridge,lr,rf,knn]
mn= ['Ridge Regression','Linear Regression','Random Forest','KNN']

In [21]:
for i in range(len(m)):
    model = m[i].fit(X_train,y_train)
    y_pred = model.predict(X_test)

    MSE = mse(y_test,y_pred)
    print('model:',mn[i])
    print('MSE:',MSE)


model: Ridge Regression
MSE: 0.6237624354226435
model: Linear Regression
MSE: 3.12986255571913e+23
model: Random Forest
MSE: 0.6859185046077865
model: KNN
MSE: 0.8929140228180514
