In [1]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import re
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Read the tab-separated training set, discard every column that contains at
# least one missing value, then remove the two per-rater score columns.
df = (
    pd.read_csv("./training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1')
    .dropna(axis=1)
    .drop(columns=['rater1_domain1', 'rater2_domain1'])
)
# Ad-hoc check of one set's top score: max(df[df['essay_set']==8]['domain1_score'])

In [3]:
# (lowest, highest) domain1_score used for scaling, one pair per essay_set.
# Position 0 belongs to essay_set 1, position 7 to essay_set 8.
score_bounds = [(2, 12), (1, 6), (0, 3), (0, 3), (0, 4), (0, 4), (0, 30), (0, 60)]
min_range = [low for low, _ in score_bounds]
max_range = [high for _, high in score_bounds]

def normalize(x, minScore, maxScore):
    """Rescale a raw score onto a 0-10 scale and round it.

    Args:
        x: Raw score.
        minScore: Lowest score of the scale ``x`` was graded on.
        maxScore: Highest score of that scale.

    Returns:
        ``round(((x - minScore) / (maxScore - minScore)) * 10)``. Rounding is
        done by the built-in ``round``, so exact halves go to the nearest even
        integer (2.5 -> 2, 7.5 -> 8).
    """
    span = maxScore - minScore
    fraction = (x - minScore) / span
    return round(fraction * 10)

def _final_score_for(row):
    """Scale one row's domain1_score using the bounds of that row's essay_set."""
    slot = row['essay_set'] - 1  # essay_set is 1-based, the bound lists are 0-based
    return normalize(row['domain1_score'], min_range[slot], max_range[slot])

df['final_score'] = df.apply(_final_score_for, axis=1)
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,final_score
0,1,1,"Dear local newspaper, I think effects computer...",8,6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,8
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,6


In [4]:
def clean_essay(essay):
    """Remove every whitespace-separated token that begins with "@".

    The essay is split on runs of whitespace, tokens starting with "@" are
    dropped (together with any punctuation attached to them), and the
    remaining tokens are joined back with single spaces.

    Args:
        essay: Essay text.

    Returns:
        The essay without "@"-prefixed tokens and with whitespace collapsed.
    """
    kept = [token for token in essay.split() if not token.startswith("@")]
    return ' '.join(kept)

# Overwrite the raw essay text with its "@"-token-free version.
df['essay'] = df['essay'].apply(clean_essay)


In [5]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def remove_stop_words(essay):
    """Drop stop words from an essay.

    The essay is tokenized with ``word_tokenize``; tokens found in
    ``stop_words`` are discarded and the rest are joined with single spaces.
    The comparison is exact (no lower-casing), so a token is only removed when
    it appears in ``stop_words`` with the same casing.

    Args:
        essay: Essay text.

    Returns:
        Space-joined string of the surviving tokens.
    """
    kept_tokens = [token for token in word_tokenize(essay) if token not in stop_words]
    return ' '.join(kept_tokens)

# New column: the essay text with stop words removed.
df['clean_essay'] = df['essay'].apply(remove_stop_words)

In [6]:
def remove_puncs(essay):
    """Delete every character that is not an ASCII letter or a space.

    Digits, punctuation, newlines and tabs are removed outright (not replaced
    by a space), so existing runs of spaces are preserved as they are.

    Args:
        essay: Essay text.

    Returns:
        The text containing only ``A-Z``, ``a-z`` and space characters.
    """
    return re.sub("[^A-Za-z ]", "", essay)

# Strip everything except ASCII letters and spaces from the stop-word-free text.
df['clean_essay'] = df['clean_essay'].apply(remove_puncs)
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,final_score,clean_essay
0,1,1,"Dear local newspaper, I think effects computer...",8,6,Dear local newspaper I think effects computer...
1,2,1,Dear I believe that using computers will benef...,9,7,Dear I believe using computers benefit us many...
2,3,1,"Dear, More and more people use computers, but ...",7,5,Dear More people use computers everyone agre...
3,4,1,"Dear Local Newspaper, I have found that many e...",10,8,Dear Local Newspaper I found many experts say...
4,5,1,Dear I know having computers has a positive ef...,8,6,Dear I know computers positive effect people ...


In [7]:
def sent2word(x):
    """Tokenize one sentence into words.

    Every character that is not an ASCII letter or digit is first replaced by
    a space; the result is then tokenized with ``nltk.word_tokenize``.

    Args:
        x: Sentence text.

    Returns:
        List of word tokens.
    """
    alnum_only = re.sub("[^A-Za-z0-9]", " ", x)
    return nltk.word_tokenize(alnum_only)

def essay2word(essay):
    """Split an essay into sentences and tokenize each sentence into words.

    The essay is stripped of surrounding whitespace, split into sentences with
    NLTK's pre-trained English punkt tokenizer, and each non-empty sentence is
    tokenized with ``sent2word``.

    Args:
        essay: Essay text.

    Returns:
        A list with one entry per non-empty sentence, each entry being the
        list of word tokens of that sentence.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(essay.strip())
    # Every sentence must be visited: slicing to the first one would cap the
    # sentence count at 1 and restrict the word, character and POS counts
    # built on this function to the opening sentence.
    return [sent2word(sentence) for sentence in sentences if len(sentence) > 0]
        

def noOfWords(essay):
    """Count the word tokens across the token lists returned by ``essay2word``.

    Args:
        essay: Essay text.

    Returns:
        Total number of tokens (an int; 0 when there are none).
    """
    return sum(len(sentence_tokens) for sentence_tokens in essay2word(essay))

def noOfChar(essay):
    """Sum the lengths of all word tokens returned by ``essay2word``.

    Only characters that are part of a token are counted, so whitespace and
    anything removed during tokenization do not contribute.

    Args:
        essay: Essay text.

    Returns:
        Total character count over all tokens (an int; 0 when there are none).
    """
    return sum(
        len(token)
        for sentence_tokens in essay2word(essay)
        for token in sentence_tokens
    )

def avg_word_len(essay):
    """Average token length: ``noOfChar(essay) / noOfWords(essay)``.

    Args:
        essay: Essay text.

    Returns:
        The ratio as a float, or the int 0 when the essay yields no tokens
        (which avoids a division by zero).
    """
    total_words = noOfWords(essay)
    return noOfChar(essay) / total_words if total_words != 0 else 0

def noOfSent(essay):
    """Return how many token lists (one per sentence) ``essay2word`` yields."""
    sentence_token_lists = essay2word(essay)
    return len(sentence_token_lists)

def count_pos(essay):
    """Count tokens by the first letter of their NLTK part-of-speech tag.

    Each token list returned by ``essay2word`` is tagged with
    ``nltk.pos_tag``. A token is counted as a noun, verb, adjective or adverb
    when its tag starts with "N", "V", "J" or "R" respectively; all other
    tags are ignored.

    Args:
        essay: Essay text.

    Returns:
        Tuple ``(noun_count, verb_count, adj_count, adverb_count)``.
    """
    tally = {'N': 0, 'V': 0, 'J': 0, 'R': 0}
    for tokens in essay2word(essay):
        for tagged in nltk.pos_tag(tokens):
            initial = tagged[1][0]  # first character of the tag
            if initial in tally:
                tally[initial] += 1
    return tally['N'], tally['V'], tally['J'], tally['R']


def check_spell_error(essay, vocabulary=None):
    """Count the words of ``essay`` that are absent from a reference vocabulary.

    The essay is lower-cased, every non-alphanumeric character is replaced by
    a space, digits are deleted, and the remainder is split on whitespace.
    Each resulting word that is not in the vocabulary counts as one error;
    repeated words are counted every time they occur.

    Args:
        essay: Essay text.
        vocabulary: Optional collection of known lower-case words. When
            omitted, the module-level ``words`` collection is used.

    Returns:
        Number of words not found in the vocabulary (an int).
    """
    if vocabulary is None:
        # ``words`` is built as a list, where each ``in`` test is a linear
        # scan. Convert it to a frozenset once and reuse that for as long as
        # ``words`` refers to the same object.
        cached = getattr(check_spell_error, "_lookup", None)
        if cached is None or check_spell_error._source is not words:
            check_spell_error._source = words
            check_spell_error._lookup = cached = frozenset(words)
        vocabulary = cached

    lowered = essay.lower()
    cleaned = re.sub("[^A-Za-z0-9]", " ", lowered)
    cleaned = re.sub("[0-9]", "", cleaned)
    return sum(1 for token in cleaned.split() if token not in vocabulary)

# Reference word list used by check_spell_error: every run of lower-case
# ASCII letters found in big.txt (after lower-casing), in file order and with
# repeats.
with open('big.txt') as corpus_file:  # context manager closes the handle
    data = corpus_file.read()
words = re.findall('[a-z]+', data.lower())

In [37]:
# Bag-of-n-grams features over the cleaned essays: 1- to 3-grams, capped at
# 10,000 features, with scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(max_features = 10000, ngram_range=(1, 3), stop_words='english')

cv = vectorizer.fit(df['clean_essay'])
count_vectors = vectorizer.transform(df['clean_essay'])
# Persist the fitted vectorizer; the context manager closes the file handle.
with open("vector.pickle", "wb") as vector_file:
    pickle.dump(cv, vector_file)

# get_feature_names() is absent from newer scikit-learn releases; use its
# replacement when available and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
data = df[['essay_set','clean_essay','final_score']].copy()
X = count_vectors.toarray()
y = data['final_score'].to_numpy()
# A fixed seed makes the 70/30 split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)



[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [29]:
# To retrain and save the model, uncomment the lines below. The file name
# matches the one loaded further down.
#rf = RandomForestRegressor(n_estimators = 2, random_state = 42)
#rf.fit(X_train, y_train)
#with open('RF_without_PP', 'wb') as model_file:
#    pickle.dump(rf, model_file)

#Use Saved Model
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files from a trusted source.
# NOTE(review): unless the saved model was fitted on exactly this X_train,
# X_test may contain rows it was trained on, which would make the error
# below optimistic; confirm how the pickle was produced.
with open('RF_without_PP', 'rb') as model_file:
    rf = pickle.load(model_file)
predictions = rf.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predictions))

Mean squared error: 2.14


In [12]:

# pro_data = df.copy()
# pro_data['char_count'] = pro_data['essay'].apply(noOfChar)
# pro_data['word_count'] = pro_data['essay'].apply(noOfWords)
# pro_data['sent_count'] = pro_data['essay'].apply(noOfSent)
# pro_data['avg_word_len'] = pro_data['essay'].apply(avg_word_len)
# pro_data['spell_err_count'] = pro_data['essay'].apply(check_spell_error)
# pro_data['noun_count'], pro_data['verb_count'], pro_data['adj_count'], pro_data['adv_count'] = zip(*pro_data['essay'].map(count_pos))
# #pro_data['noun_count'] = pro_data['essay'].apply(count_pos)

# pro_data.to_csv("Processed_data.csv")

In [36]:
# Load the pre-computed per-essay features and discard the index column that
# was written out with the CSV.
prep_df = pd.read_csv("Processed_data.csv")
prep_df = prep_df.drop(columns='Unnamed: 0')

vectorizer = CountVectorizer(max_features = 10000, ngram_range=(1, 3), stop_words='english')

# The n-gram counts come from the in-memory `df`, while the extra features
# and the labels come from `prep_df`; the two are combined row by row.
# NOTE(review): this assumes both frames hold the same essays in the same
# order; only the row count is checked here.
assert len(prep_df) == len(df), "Processed_data.csv and df have different row counts"
cv = vectorizer.fit(df['clean_essay'])
count_vectors = vectorizer.transform(df['clean_essay'])
# Persist the fitted vectorizer; the context manager closes the file handle.
with open("vector_additional.pickle", "wb") as vector_file:
    pickle.dump(cv, vector_file)

# get_feature_names() is absent from newer scikit-learn releases; use its
# replacement when available and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

X = count_vectors.toarray()
# Columns from position 6 onward are placed in front of the n-gram counts.
# NOTE(review): presumably these are the count features written by the
# feature-extraction cell; confirm against the CSV's column order.
X_full = np.concatenate((prep_df.iloc[:, 6:].to_numpy(), X), axis = 1)

y_full = prep_df['final_score'].to_numpy()
# A fixed seed makes the 70/30 split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size = 0.3, random_state = 42)

[[183.  39.   1. ...   0.   0.   0.]
 [128.  25.   1. ...   0.   0.   0.]
 [ 76.  15.   1. ...   0.   0.   0.]
 ...
 [ 33.  10.   1. ...   0.   0.   0.]
 [ 89.  20.   1. ...   0.   0.   0.]
 [ 47.   9.   1. ...   0.   0.   0.]]


In [10]:
#Save Trained Model
# rf = RandomForestRegressor(n_estimators = 10, random_state = 42)
# rf.fit(X_train, y_train)
# with open('RF_with_PP', 'wb') as model_file:
#     pickle.dump(rf, model_file)

#Use Saved Model
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files from a trusted source.
# NOTE(review): unless the saved model was fitted on exactly this X_train,
# X_test may contain rows it was trained on, which would make the error
# below optimistic; confirm how the pickle was produced.
with open('RF_with_PP', 'rb') as model_file:
    rf = pickle.load(model_file)
y_pred = rf.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Spot-check a few predictions against their true test labels:
# for i in range(500,510):
#      print(y_pred[i],y_test[i])

Mean squared error: 1.28


In [26]:
from sklearn.metrics import cohen_kappa_score

# Quadratic-weighted Cohen's kappa between the true test scores and the
# predictions rounded to whole scores.
y_test_list = y_test.tolist()
y_pred_list = [round(prediction) for prediction in y_pred]

kappa = cohen_kappa_score(y_test_list, y_pred_list, weights='quadratic')
kappa

0.8678154765344008