__Imports__

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
#nltk.download()

__Read in Data__

In [2]:
df_1 = pd.read_csv("data/train.csv")
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [3]:
df_1.describe()

Unnamed: 0,score
count,36473.0
mean,0.362062
std,0.258335
min,0.0
25%,0.25
50%,0.25
75%,0.5
max,1.0


In [4]:
df_1.isnull().any()

id         False
anchor     False
target     False
context    False
score      False
dtype: bool

__This is the number of words in each row of a given column__
* Must be applied to anchor and target

These are the distinct word counts found in the __anchor__ column; the maximum is 5

In [5]:
# Distinct whitespace-delimited word counts across the anchor phrases.
print(np.unique(df_1['anchor'].str.split().str.len().to_numpy()))

[1 2 3 4 5]


These are the distinct word counts found in the __target__ column; the maximum is 15

In [6]:
# Distinct whitespace-delimited word counts across the target phrases.
print(np.unique(df_1['target'].str.split().str.len().to_numpy()))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 15]


We can drop __context__ because it's a label (a single token per row, as shown below)

In [7]:
# Distinct whitespace-delimited word counts across the context values.
print(np.unique(df_1['context'].str.split().str.len().to_numpy()))

[1]


__Tokenization__

In [8]:
# Break each phrase into NLTK word tokens (word_tokenize is imported in the first cell).
df_1['target'] = df_1['target'].apply(word_tokenize)
df_1['anchor'] = df_1['anchor'].apply(word_tokenize)

In [10]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abatement],"[abatement, of, pollution]",A47,0.5
1,7b9652b17b68b7a4,[abatement],"[act, of, abating]",A47,0.75
2,36d72442aefd8232,[abatement],"[active, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abatement],"[eliminating, process]",A47,0.5
4,54c1e3b9184cb5b6,[abatement],"[forest, region]",A47,0.0


__Stopwords Removal__

In [11]:
from nltk.corpus import stopwords

# Build the English stop-word lookup once, as a set, so membership tests stay cheap.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words.
df_1['target'] = df_1['target'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])
df_1['anchor'] = df_1['anchor'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])

In [12]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abatement],"[abatement, pollution]",A47,0.5
1,7b9652b17b68b7a4,[abatement],"[act, abating]",A47,0.75
2,36d72442aefd8232,[abatement],"[active, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abatement],"[eliminating, process]",A47,0.5
4,54c1e3b9184cb5b6,[abatement],"[forest, region]",A47,0.0


__Stemming__

In [13]:
from nltk.stem.snowball import SnowballStemmer

# Reduce every token to its English Snowball stem.
snowBallStemmer = SnowballStemmer("english")
df_1['target'] = df_1['target'].apply(lambda tokens: list(map(snowBallStemmer.stem, tokens)))
df_1['anchor'] = df_1['anchor'].apply(lambda tokens: list(map(snowBallStemmer.stem, tokens)))

In [14]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abat],"[abat, pollut]",A47,0.5
1,7b9652b17b68b7a4,[abat],"[act, abat]",A47,0.75
2,36d72442aefd8232,[abat],"[activ, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abat],"[elimin, process]",A47,0.5
4,54c1e3b9184cb5b6,[abat],"[forest, region]",A47,0.0


__Convert each token list into a single comma-separated string__

In [15]:
# Collapse each token list into one comma-delimited string.
df_1['anchor'] = df_1['anchor'].apply(lambda tokens: ','.join(str(tok) for tok in tokens))
df_1['target'] = df_1['target'].apply(lambda tokens: ','.join(str(tok) for tok in tokens))

In [17]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abat,"abat,pollut",A47,0.5
1,7b9652b17b68b7a4,abat,"act,abat",A47,0.75
2,36d72442aefd8232,abat,"activ,catalyst",A47,0.25
3,5296b0c19e1ce60e,abat,"elimin,process",A47,0.5
4,54c1e3b9184cb5b6,abat,"forest,region",A47,0.0


These are the unique values in the __anchor__ column; the first 10 are shown below

In [18]:
np.unique(np.array(df_1['anchor']))[:10]

array(['abat', 'abnorm,posit', 'absorb,properti', 'acan', 'accept,inform',
       'achiev,authent', 'acid,absorpt', 'ack', 'acoustoopt,modul',
       'acryl,group'], dtype=object)

These are the unique values in the __target__ column; the first 10 are shown below

In [19]:
np.unique(np.array(df_1['target']))[:10]

array(['', '1,amino,2,methoxyethan', '1,azabicyclo', '1,bromopropan',
       '1,methoxi,2,aminoethan', '10,bit,dac',
       '111444,hexafluoro,2,buten', '12,dibromopropan',
       '120,degre,interv', '12cao7al2o3'], dtype=object)

We have the issue of words being joined together in one string; we need to
separate them so that each word can be collected and encoded individually

In [219]:
anchor_set = set()
target_set = set()

In [220]:
def separator_anchor(arr):
    """Add every element of ``arr`` to the module-level ``anchor_set``."""
    anchor_set.update(arr)

In [221]:
def separator_target(arr):
    """Add every element of ``arr`` to the module-level ``target_set``."""
    target_set.update(arr)

In [222]:
# Turn each comma-delimited string back into a list of tokens.
df_1['anchor'] = df_1['anchor'].str.split(',')
df_1['target'] = df_1['target'].str.split(',')

In [223]:
# Both applies run only for their side effect of filling anchor_set / target_set.
# The helpers return nothing, so the resulting Series hold only None.
df_1['anchor'].apply(lambda x: separator_anchor(x))
df_1['target'].apply(lambda x: separator_target(x))

0        None
1        None
2        None
3        None
4        None
         ... 
36468    None
36469    None
36470    None
36471    None
36472    None
Name: target, Length: 36473, dtype: object

__list comprehension__

In [224]:
anchor_arr = [i for i in anchor_set]
target_arr = [i for i in target_set]

__Label Encoding__

In [225]:
# One encoder per column, each fitted on that column's own token vocabulary.
labelencoder_anchor = LabelEncoder()
labelencoder_target = LabelEncoder()
labelencoder_anchor.fit(anchor_arr)
labelencoder_target.fit(target_arr)

LabelEncoder()

In [226]:
def encoding_anchor(arr):
    """Encode the tokens of one anchor phrase as integer labels.

    Parameters
    ----------
    arr : sequence of str
        Tokens of a single anchor phrase. Every token must be known to the
        fitted ``labelencoder_anchor``.

    Returns
    -------
    list of int
        One label per token, in input order. The result is always a list,
        whatever the number of tokens; empty input yields ``[]``.

    Raises
    ------
    ValueError
        Propagated from ``LabelEncoder.transform`` when a token was not seen
        during fitting.
    """
    if len(arr) == 0:
        return []
    # A single vectorised transform call replaces one encoder call per token.
    return labelencoder_anchor.transform(list(arr)).tolist()

In [227]:
def encoding_target(arr):
    """Encode the tokens of one target phrase as integer labels.

    Parameters
    ----------
    arr : sequence of str
        Tokens of a single target phrase. Every token must be known to the
        fitted ``labelencoder_target``.

    Returns
    -------
    list of int
        One label per token, in input order. The result is always a list,
        whatever the number of tokens; empty input yields ``[]``.

    Raises
    ------
    ValueError
        Propagated from ``LabelEncoder.transform`` when a token was not seen
        during fitting.
    """
    if len(arr) == 0:
        return []
    # A single vectorised transform call replaces one encoder call per token.
    return labelencoder_target.transform(list(arr)).tolist()

In [228]:
print("encoding anchor")

encoding anchor


In [229]:
df_1['anchor'] = df_1['anchor'].apply(lambda x: encoding_anchor(x))

In [230]:
print("encoding target")

encoding target


In [231]:
df_1['target'] = df_1['target'].apply(lambda x: encoding_target(x))

In [232]:
print(df_1.head())

                 id anchor        target context  score
0  37d61fd2272659b1    [0]    [35, 4113]     A47   0.50
1  7b9652b17b68b7a4    [0]      [84, 35]     A47   0.75
2  36d72442aefd8232    [0]     [86, 852]     A47   0.25
3  5296b0c19e1ce60e    [0]  [1798, 4260]     A47   0.50
4  54c1e3b9184cb5b6    [0]  [2177, 4500]     A47   0.00


In [233]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[0],"[35, 4113]",A47,0.5
1,7b9652b17b68b7a4,[0],"[84, 35]",A47,0.75
2,36d72442aefd8232,[0],"[86, 852]",A47,0.25
3,5296b0c19e1ce60e,[0],"[1798, 4260]",A47,0.5
4,54c1e3b9184cb5b6,[0],"[2177, 4500]",A47,0.0


The anchor column has at most 5 words per row
The target column has at most 15 words per row

In [343]:
# Spread each row's encoded anchor tokens over five positional lists (l1..l5).
# Position k holds the k-th token of every row; rows with fewer tokens are
# padded with NaN in the trailing positions.
MAX_ANCHOR_TOKENS = 5
anchor_slots = [[] for _ in range(MAX_ANCHOR_TOKENS)]

for row_number, codes in enumerate(df_1['anchor']):
    codes = list(codes)
    if not 1 <= len(codes) <= MAX_ANCHOR_TOKENS:
        # Such a row cannot be placed. Skipping it would leave the lists
        # shorter than the frame, so fail with an explicit message instead.
        raise ValueError(
            "anchor row {} has {} tokens; expected between 1 and {}".format(
                row_number, len(codes), MAX_ANCHOR_TOKENS
            )
        )
    padded = codes + [np.nan] * (MAX_ANCHOR_TOKENS - len(codes))
    for slot, value in zip(anchor_slots, padded):
        slot.append(value)

# Keep the names the following cell reads from.
l1, l2, l3, l4, l5 = anchor_slots

In [348]:
# Attach the five positional anchor lists as columns anchor_1 .. anchor_5.
for position, column_values in enumerate((l1, l2, l3, l4, l5), start=1):
    df_1['anchor_{}'.format(position)] = np.array(column_values)

In [349]:
df_1

Unnamed: 0,id,anchor,target,context,score,anchor_1,anchor_2,anchor_3,anchor_4,anchor_5
0,37d61fd2272659b1,[0],"[35, 4113]",A47,0.50,0,,,,
1,7b9652b17b68b7a4,[0],"[84, 35]",A47,0.75,0,,,,
2,36d72442aefd8232,[0],"[86, 852]",A47,0.25,0,,,,
3,5296b0c19e1ce60e,[0],"[1798, 4260]",A47,0.50,0,,,,
4,54c1e3b9184cb5b6,[0],"[2177, 4500]",A47,0.00,0,,,,
...,...,...,...,...,...,...,...,...,...,...
36468,8e1386cbefd7f245,"[935, 65]","[6035, 352]",B44,1.00,935,65.0,,,
36469,42d9e032d1cd3242,"[935, 65]","[6035, 667]",B44,0.50,935,65.0,,,
36470,208654ccb9e14fa3,"[935, 65]","[6035, 2431]",B44,0.50,935,65.0,,,
36471,756ec035e694722b,"[935, 65]","[6035, 3259]",B44,0.75,935,65.0,,,


In [350]:
# df_1.fillna(36473,inplace=True)

In [351]:
# df_1[df_1['anchor_5']!=36473.0]

In [328]:
# df_1.drop(df_1.columns[5:],axis=1,inplace=True)

In [352]:
# Spread each row's encoded target tokens over fifteen positional lists
# (l1..l15). Position k holds the k-th token of every row; rows with fewer
# tokens are padded with NaN in the trailing positions.
MAX_TARGET_TOKENS = 15
target_slots = [[] for _ in range(MAX_TARGET_TOKENS)]

for row_number, codes in enumerate(df_1['target']):
    codes = list(codes)
    if not 1 <= len(codes) <= MAX_TARGET_TOKENS:
        # Such a row cannot be placed. Skipping it would leave the lists
        # shorter than the frame, so fail with an explicit message instead.
        raise ValueError(
            "target row {} has {} tokens; expected between 1 and {}".format(
                row_number, len(codes), MAX_TARGET_TOKENS
            )
        )
    padded = codes + [np.nan] * (MAX_TARGET_TOKENS - len(codes))
    for slot, value in zip(target_slots, padded):
        slot.append(value)

# Keep the names the following cell reads from.
(l1, l2, l3, l4, l5,
 l6, l7, l8, l9, l10,
 l11, l12, l13, l14, l15) = target_slots

In [353]:
# Attach the fifteen positional target lists as columns target_1 .. target_15.
target_position_lists = (
    l1, l2, l3, l4, l5,
    l6, l7, l8, l9, l10,
    l11, l12, l13, l14, l15,
)
for position, column_values in enumerate(target_position_lists, start=1):
    df_1['target_{}'.format(position)] = np.array(column_values)

In [364]:
# Replace the NaN padding with a numeric sentinel. Using a number rather than
# the string '10101010' keeps the padded columns numeric instead of turning
# them into mixed float/str object columns.
PAD_SENTINEL = 10101010
df_1 = df_1.fillna(PAD_SENTINEL)

__Splitting (70/30)__
* Here we test/evaluate our models

In [366]:
# Features: every column after the first five (id, anchor, target, context, score).
x = df_1.iloc[:, 5:]
# Regression target as a plain NumPy array.
y = df_1['score'].to_numpy()

In [368]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=141)

__Random Forest Regressor__

In [370]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Fit a random forest on the training split and score it on the held-out split.
regr = RandomForestRegressor(random_state=141)
regr.fit(x_train, y_train)
# Predictions are rounded to two decimals before the metrics are computed.
y_pred = np.round(regr.predict(x_test), decimals=2)

mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as sqrt(MSE) rather than mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and later removed.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Square Error: {} \nRoot Mean Squared Error: {} \nMean Absolute Error: {} ".format(mse,rmse,mae))

Mean Square Error: 0.04834529336501554 
Root Mean Squared Error: 0.21987563158525672 
Mean Absolute Error: 0.16693657466642298 


__Lasso Regression__

In [372]:
from sklearn.linear_model import Lasso

# Fit an L1-regularised linear model on the training split and score it on the
# held-out split. The metric functions are imported in the random-forest cell.
lr = Lasso(alpha=0.5)
lr.fit(x_train, y_train)
# Predictions are rounded to two decimals before the metrics are computed.
y_pred = np.round(lr.predict(x_test), decimals=2)

mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as sqrt(MSE) rather than mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and later removed.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Square Error: {} \nRoot Mean Squared Error: {} \nMean Absolute Error: {} ".format(mse,rmse,mae))

Mean Square Error: 0.06468399744105283 
Root Mean Squared Error: 0.2543304886187514 
Mean Absolute Error: 0.20861177115700968 


__Prediction Submission__
* Will be used later for competition