In [1]:
import nltk
import re
import pandas as pd
import numpy as np
import os
from pathlib import Path
from scipy import stats

In [2]:
#import data
train_data = "/Users/zxj/Desktop/study/semester3/MCS/humor/data/task-1/train.csv"
test_data="/Users/zxj/Desktop/study/semester3/MCS/humor/data/task-1/dev.csv"

#get data with ascending sort by score
train=pd.read_csv(train_data)
column=train.columns.values
test=train[9100:]
train=train[:9100]
test = test.sort_values(column[4],ascending=False)
test_label=test.meanGrade

#get label
train_label=train.meanGrade
train_label=train_label

#get orignial news
train_news=train.original
train_funny_word=train.edit
test_news=test.original
test_funny_word=test.edit
a=[]
for i in test_funny_word:
    a.append(i)
test_funny_word=a


In [3]:
#extract edited word from news
def find_tag(news):
    p = re.compile(r'[<](.*?)/[>]', re.S)
    return re.findall(p, news)
print(train_news[0])

France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq


In [4]:
#do preprocess
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
tt=nltk.tokenize.regexp.WordPunctTokenizer()
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()

def preprocess_news(sentence,funny_word):
    final=[]
    num=0
    for event in sentence:
        BOW={}
        ori=event
        
        #tokenize and remove Punctuation
        event=removePunctuation(event)
        event=tt.tokenize(event)
        for words in event:
            words=lemmatizer.lemmatize(words)
            if words not in stopwords:
                
                #generate Bag of words
                BOW[words.lower()]=BOW.get(words,0)+1
        
        #give more weight for funny word
        BOW[funny_word[num].lower()]=BOW.get(funny_word[num],0)+10
        final.append(BOW)
        num+=1
    return final

punctuation = '!,;:?"\'.\'/<>'
def removePunctuation(text):
    text = re.sub(r'[{}]+'.format(punctuation),'',text)
    return text.strip()

train_news=preprocess_news(train_news,train_funny_word)
test_news=preprocess_news(test_news,test_funny_word)

In [5]:
print(len(train_news))
print(len(test_news))
print(len(train_label))
print(len(test_label))
print(train_news[0])

9100
552
9100
552
{'france': 1, '‘': 1, 'hunting': 1, 'citizen': 1, 'joined': 1, 'isis': 1, '’': 1, 'without': 1, 'trial': 1, 'iraq': 1, 'twins': 10}


In [6]:
#transform to vector
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer=DictVectorizer()
transformer=TfidfTransformer(smooth_idf=False,norm=None)
train_matrix=vectorizer.fit_transform(train_news)
#use TF_IDF
train_matrix=transformer.fit_transform(train_matrix)

#transform test
test_matrix=vectorizer.transform(test_news)
test_matrix=transformer.transform(test_matrix)
print(test_matrix.shape)
print(train_label.shape)
print(train_matrix.shape)

(552, 10191)
(9100,)
(9100, 10191)


In [7]:
#do prediction and evaluation matrix for later works
def prediction(model):
    #best=model.fit(train_matrix,train_label)
    pre=model.predict(test_matrix)
    rmse = np.sqrt(np.mean((test_label - pre)**2))
    psr=stats.pearsonr(np.array(pre),np.array(test_label))
    print("The person correlation coefficient is:")
    print(psr)
    print("The result of RMSE in the whole test data is:")
    print(rmse)
    pre=model.predict(test_matrix[:100])
    rmse = np.sqrt(np.mean((test_label[:100] - pre)**2))    
    print("The result of RMSE in the BEST 10% test data is:")
    print(rmse)
    pre=model.predict(test_matrix[:200])
    rmse = np.sqrt(np.mean((test_label[:200] - pre)**2))    
    print("The result of RMSE in the BEST 20% test data is:")
    print(rmse)
    pre=model.predict(test_matrix[:300])
    rmse = np.sqrt(np.mean((test_label[:300] - pre)**2))    
    print("The result of RMSE in the BEST 30% test data is:")
    print(rmse)

In [8]:
from sklearn.model_selection import ShuffleSplit,GridSearchCV
from sklearn import linear_model
from sklearn.metrics import make_scorer,mean_squared_error,r2_score

#use MSE as scorer
#scorer=make_scorer(mean_squared_error)
scorer=make_scorer(r2_score)
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0)

#LASSO Method (alpha from 0.1 to 100)
a=np.logspace(-2,4,10)
lasso=linear_model.Lasso(fit_intercept=True)
grid=GridSearchCV(lasso,param_grid={'alpha':a},cv=cv,scoring=scorer)
grid.fit(train_matrix,train_label)

print("The best parameters are {0.best_params_} with R2——score of {0.best_score_:.3g}".format(grid))
best=grid.best_estimator_ 
prediction(best)


The best parameters are {'alpha': 0.01} with R2——score of 0.0798
The person correlation coefficient is:
(0.334611925031881, 6.5954222220342e-16)
The result of RMSE in the whole test data is:
0.5330091503585478
The result of RMSE in the BEST 10% test data is:
0.8391077177188763
The result of RMSE in the BEST 20% test data is:
0.6467736174689563
The result of RMSE in the BEST 30% test data is:
0.5334505744550625


In [12]:
test_eval_data = "/Users/zxj/Desktop/study/semester3/MCS/humor/data/task-1/data/task-1/test_eval.csv"

test_eval=pd.read_csv(test_eval_data)
test_eval_label=test_eval.meanGrade

#get orignial news
test_eval_news=test_eval.original
test_eval_funny_word=test_eval.edit

test_eval_news=preprocess_news(test_eval_news,test_eval_funny_word)
test_eval_matrix=vectorizer.transform(test_eval_news)
test_eval_matrix=transformer.transform(test_eval_matrix)

pre=grid.predict(test_eval_matrix)
rmse = np.sqrt(np.mean((test_eval_label - pre)**2))

import pandas as pd
id=test_eval.id
dataframe = pd.DataFrame({'id':id,'pred':pre})
dataframe.to_csv("task-1-output.csv",index=False,sep=',')

In [9]:
#ridge method 
ridge=linear_model.Ridge(solver='sparse_cg')
a=np.logspace(-2,5,10)
grid2=GridSearchCV(ridge,param_grid={'alpha':a},cv=cv,scoring=scorer)
grid2.fit(train_matrix,train_label)

print("The best parameters are {0.best_params_} with R2-score of {0.best_score_:.3g}".format(grid2))
prediction(grid2)

The best parameters are {'alpha': 16681.005372000593} with R2-score of 0.0921
The person correlation coefficient is:
(0.334558900664955, 6.669315156841847e-16)
The result of RMSE in the whole test data is:
0.5332356662450543
The result of RMSE in the BEST 10% test data is:
0.8166910431997259
The result of RMSE in the BEST 20% test data is:
0.6333492806158451
The result of RMSE in the BEST 30% test data is:
0.5262552755566311


In [None]:
#elasticnet method
#alpha and ratio for L1
a=np.linspace(0.01,5,10)
b=np.linspace(0,1,5)
elastic=linear_model.ElasticNet()

grid3=GridSearchCV(elastic,param_grid={'alpha':a,'l1_ratio':b},cv=cv,scoring=make_scorer(mean_squared_error))
grid3.fit(train_matrix,train_label)

print("The best parameters are {0.best_params_} with R2-score of {0.best_score_:.3g}".format(grid3))
prediction(grid3)

In [None]:
### from sklearn.svm import SVR
c_range=np.logspace(-2,5,10)
gamma=np.logspace(-5,1,10)
degree=[0,1,2,3,4]
svr=SVR(kernel='poly')

grid4=GridSearchCV(svr,param_grid={'epsilon':epsilon,'C':c_range,'degree':degree},cv=cv,scoring=scorer)
grid4.fit(train_matrix,train_label)
print("The best parameters are {0.best_params_} with R2-score of {0.best_score_:.3g}".format(grid4))
prediction(grid4)