In [12]:
import ast
import re

import numpy as np
import pandas as pd

#CrossVal
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit as SSplit
from sklearn.metrics import mean_squared_error


#GridSearch
from sklearn.grid_search import GridSearchCV
#models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

#ensembles
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.ensemble import VotingClassifier
import xgboost as xgb

#text stuff
from sklearn.feature_extraction.text import TfidfVectorizer



In [33]:
#Just the preprocessing function. Play around with various features.
#I'm not using all the features yet.

def preprocessing(df):
    """Turn a raw review frame into a feature frame.

    Steps, in order:
      1. Drop the columns that are not used as features
         (`id`, `tid`, `date`, `dept`, `forcredit`, `attendance`).
      2. One-hot encode `textbookuse`, `interest`, `grade`, `profgender`,
         `profhotness` and `online`: the indicator columns are appended to
         the frame and the source column is removed.
      3. Expand `tags` into one 0/1 column per distinct tag and remove `tags`.

    Every other column (e.g. `comments`) is passed through untouched.

    Any of the columns above that is absent from `df` is skipped, so a caller
    may drop a feature beforehand without breaking this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. `tags` cells are assumed to hold the string form of a list
        of tag names (e.g. "['funny', 'tough']"); a missing cell is treated
        as "no tags".

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.

    Raises
    ------
    ValueError, SyntaxError
        If a `tags` cell is not a plain Python literal.
    """
    unused_columns = ['id', 'tid', 'date', 'dept', 'forcredit', 'attendance']
    # (source column, prefix of the generated indicator columns)
    one_hot_columns = [
        ('textbookuse', 'textbk'),
        ('interest', 'interest'),
        ('grade', 'grade'),
        ('profgender', 'profgender'),
        ('profhotness', 'profhotness'),
        ('online', 'online'),
    ]

    def join_tags(raw):
        # NaN/None means the row has no tags at all.
        if pd.isnull(raw):
            return ''
        # literal_eval only accepts literals, so a crafted CSV cell cannot
        # execute code the way eval() would.
        # NOTE: a tag that itself contains '*' would be split in two below.
        return '*'.join(ast.literal_eval(raw))

    df = df.drop([col for col in unused_columns if col in df.columns], axis=1)

    for column, prefix in one_hot_columns:
        if column not in df.columns:
            continue
        indicators = pd.get_dummies(df[column], prefix=prefix)
        df = df.join(indicators).drop(column, axis=1)

    if 'tags' in df.columns:
        tag_strings = df['tags'].apply(join_tags)
        df = df.join(tag_strings.str.get_dummies(sep='*')).drop('tags', axis=1)

    return df

In [34]:
#CrossVal - I'm splitting the data into train and test and running the
# preprocessing on each split. Change the second number
#in SSplit to control the number of crossval iterations it should do.
#Try different models - LR seems best for now.


# Hold-out validation: stratified 60/40 split(s) of train.csv, features built
# with preprocessing() + TF-IDF of the comments, then a linear regression is
# cross-validated on the train part and scored (MSE) on the held-out part.
data = pd.read_csv('train.csv')
Y = data['quality']
sss = SSplit(Y, 1, test_size=0.4, random_state=7)

# The target itself plus three further columns excluded from the features
# (presumably related rating scores that would leak the target - confirm).
dropcolnames = ['quality', 'helpfulness', 'clarity', 'easiness']

for train_index, test_index in sss:
    # The split yields positions, so index both X and Y positionally.
    X_train = preprocessing(data.iloc[train_index].drop(dropcolnames, axis=1))
    X_test = preprocessing(data.iloc[test_index].drop(dropcolnames, axis=1))
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # TF-IDF on train: the vocabulary is learned from the training comments only.
    tfidf2 = TfidfVectorizer(token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b',
                             max_features= 2000)
    X_counts = tfidf2.fit_transform(X_train['comments'].fillna(""))
    colnames = tfidf2.get_feature_names()
    countsdf = pd.DataFrame(data=X_counts.toarray(),
                            columns=colnames,
                            index=X_train.index.values)
    countsdf = countsdf.add_prefix('comments_')
    X_train = X_train.drop('comments', axis=1).join(countsdf)

    # TF-IDF on test: transform only, reusing the vocabulary fitted on train.
    X_counts = tfidf2.transform(X_test['comments'].fillna(""))
    countsdf = pd.DataFrame(data=X_counts.toarray(),
                            columns=colnames,
                            index=X_test.index.values)
    countsdf = countsdf.add_prefix('comments_')
    X_test = X_test.drop('comments', axis=1).join(countsdf)

    # preprocessing() one-hot encodes each split on its own, so a category or
    # tag seen in only one split produces different columns. Force the test
    # frame onto the training layout: unseen-in-test columns become 0 and
    # unseen-in-train columns are discarded.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    model = LinearRegression()
    # An empty grid makes GridSearchCV a plain cross-validation of this one
    # model; best_score_ is then its mean cross-validated score.
    cv = GridSearchCV(model, {}).fit(X_train, Y_train)

    print("R Squared: {}".format(cv.best_score_))

    # Output the Mean Squared Error using our held out training data
    mse = mean_squared_error(Y_test, cv.predict(X_test))
    print("MSE: {}".format(mse))

R Squared: 0.573069770675
MSE: 2.97629965055


In [8]:
#Run this once crossval gives a good score.
#If you changed something in the TF-IDF setup above, you'll need to change it here too.

# Build the final feature frames: the model is refit on all of train.csv and
# applied to test.csv, using the same feature pipeline as the validation cell.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Drop everything test.csv does not provide, and always the target.
# 'online' is kept: preprocessing() one-hot encodes it, and it was part of
# the feature set that produced the validation score.
dropcolnames = sorted((set(train.columns) - set(test.columns)) | {'quality'})
Y = train['quality']
train = train.drop(dropcolnames, axis=1)
# Keep the ids before preprocessing() removes the 'id' column.
test_ids = test['id']

train = preprocessing(train)
test = preprocessing(test)

# TF-IDF on train. Settings are kept identical to the validation cell
# (max_features=2000) so the validated score describes this model.
tfidf = TfidfVectorizer(token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b', max_features=2000)
X_counts = tfidf.fit_transform(train['comments'].fillna(""))
colnames = tfidf.get_feature_names()
countsdf = pd.DataFrame(data=X_counts.toarray(), columns=colnames, index=train.index.values)
countsdf = countsdf.add_prefix('comments_')
train = train.drop('comments', axis=1).join(countsdf)

# TF-IDF on test: transform only, reusing the vocabulary fitted on train.
X_counts = tfidf.transform(test['comments'].fillna(""))
countsdf = pd.DataFrame(data=X_counts.toarray(), columns=colnames, index=test.index.values)
countsdf = countsdf.add_prefix('comments_')
test = test.drop('comments', axis=1).join(countsdf)

# The two files are one-hot encoded independently, so their columns can
# differ. Put test on exactly the training layout (missing columns -> 0,
# columns unknown to the model are discarded).
test = test.reindex(columns=train.columns, fill_value=0)


In [9]:
# Fit a fresh estimator on the full training set instead of reusing whatever
# `model` object an earlier cell happened to leave in the kernel.
model = LinearRegression()
model.fit(train, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
# Predict the target for every row of the prepared `test` frame.
prediction = model.predict(X=test)

In [11]:
# Write the submission file: a header line, then one "id,quality" line per
# test row. Ids and predictions are paired by position, so this does not
# depend on how `test_ids` is indexed. The `with` block closes the file.
with open('results.txt', 'w') as opfile:
    opfile.write("id,quality\n")
    for row_id, predicted_quality in zip(test_ids, prediction):
        opfile.write(str(row_id) + "," + str(predicted_quality) + "\n")