In [1]:
import ast
import re

import numpy as np
import pandas as pd
import scipy as sp


#CrossVal
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit as SSplit
from sklearn.metrics import mean_squared_error


#GridSearch
from sklearn.grid_search import GridSearchCV

#models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

#ensembles
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.ensemble import VotingClassifier
import xgboost as xgb

#text stuff
from sklearn.feature_extraction.text import TfidfVectorizer

#pipeline 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline



In [6]:
#testing stuff
from sklearn.preprocessing import MinMaxScaler


In [37]:
# Read train.csv and test.csv (in that order) from the working directory.
data, test = map(pd.read_csv, ('train.csv', 'test.csv'))
# Preview the first rows of the training frame.
data.head()

Unnamed: 0,id,tid,dept,date,forcredit,attendance,textbookuse,interest,grade,tags,comments,helpcount,nothelpcount,online,profgender,profhotness,helpfulness,clarity,easiness,quality
0,24228248,916674,Business,01/05/2015,Yes,,It's a must have,Really into it,,"[""Would take again"", ""Hilarious"", ""Tests are t...",Great Professor My wife took this class twice ...,0,10,,0,0,4,5,3,9
1,24218909,916674,Business,01/02/2015,Yes,Mandatory,It's a must have,Sorta interested,A,"[""Skip class? You won't pass."", ""Tests are tou...",Great Professor Study the notes from class and...,0,1,,0,0,4,4,2,8
2,24215795,916674,Business,01/02/2015,Yes,,Essential to passing,Really into it,,"[""Hilarious"", ""Would take again"", ""Skip class?...",Brother Brau is a great guy He gives great spi...,1,2,,0,0,4,4,3,8
3,24204179,916674,Business,12/30/2014,Yes,Not Mandatory,Essential to passing,Sorta interested,,"[""Tests are tough"", ""Get ready to read""]",People rave about Brau but I personally dont g...,18,6,,0,0,3,1,2,4
4,24198463,916674,Business,12/28/2014,Yes,Not Mandatory,You need it sometimes,Sorta interested,A,"[""Inspirational"", ""Hilarious"", ""Skip class? Yo...",This class doesnt have much homework which was...,1,0,,0,0,4,4,4,8


array([0, 4, 1, 6, 2, 5, 3])

In [55]:
#Custom Transformers
class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick one entry out of a keyed container (e.g. a DataFrame column).

    Parameters
    ----------
    key : hashable
        The key looked up in the object handed to ``transform``.
    """

    def __init__(self, key):
        # Kept under the same name as the constructor argument; transform
        # reads it back from here.
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; returns ``self`` so calls can be chained."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` without modifying it."""
        selected = data_dict[self.key]
        return selected

class DateConvertor(TransformerMixin):
    """Stateless transformer that parses a column of dates.

    The previous ``transform`` was an unfinished stub (a bare ``return``), so
    it handed ``None`` to the next pipeline step. It now performs the same
    conversion as ``DateTransformer`` in this notebook, using the vectorised
    form of ``pd.to_datetime``.
    NOTE(review): the intended behaviour is inferred from the class name and
    from ``DateTransformer``; confirm, or drop this class if it is unused.
    """

    def fit(self , x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, feature):
        """Return ``feature`` parsed by ``pd.to_datetime``."""
        return pd.to_datetime(feature)
    
class SparseConvertor(TransformerMixin):
    """Wrap a feature in a SciPy CSR matrix and hand back its transpose.

    In this notebook it is applied to the ``helpcount`` / ``nothelpcount``
    columns so they can sit next to the other feature blocks.
    """

    def fit(self, x, y=None):
        """Stateless: returns ``self`` unchanged."""
        return self

    def transform(self, feature):
        """Return ``csr_matrix(feature).T``."""
        as_sparse = sp.sparse.csr_matrix(feature)
        # Transpose so the orientation is flipped relative to what
        # csr_matrix built from the input.
        return as_sparse.T
        

class DummyCreator(TransformerMixin):
    """One-hot encode a single categorical column with a stable column set.

    ``fit`` records the dummy columns produced by the fitting data and
    ``transform`` aligns its output to exactly those columns. Without this,
    ``pd.get_dummies`` derives the columns from whatever data it is given, so
    a test set with a missing or extra category yields a matrix of a
    different width (or column order) than the one the model was trained on.
    """

    def fit(self, x, y=None):
        """Remember the dummy columns present in ``x``; returns ``self``."""
        self.columns_ = pd.get_dummies(x).columns
        return self

    def transform(self, single_feature):
        """Return the dummy-encoded frame for ``single_feature``.

        After ``fit`` the result always has the fitted columns, in the fitted
        order: categories unseen during fitting are dropped and categories
        absent from ``single_feature`` become all-zero columns. If ``fit``
        was never called the plain ``pd.get_dummies`` output is returned.
        """
        dummies = pd.get_dummies(single_feature)
        columns = getattr(self, 'columns_', None)
        if columns is None:
            return dummies
        return dummies.reindex(columns=columns, fill_value=0)

    def get_feature_names(self):
        """Return the fitted dummy column labels as an array.

        Previously this read ``self.features``, which was never assigned, so
        the call always failed with AttributeError. It still raises
        AttributeError when called before ``fit``.
        """
        return self.columns_.values
    
class TagsDummyCreator(TransformerMixin):
    """Multi-hot encode a column whose cells hold a list literal of tags.

    Each cell is expected to be the text of a list of strings, e.g.
    ``'["Hilarious", "Tests are tough"]'``. ``fit`` records the tag columns
    seen in the fitting data and ``transform`` aligns to them, so the train
    and test matrices always have the same columns in the same order.
    """

    def fit(self, x, y=None):
        """Remember the tag columns present in ``x``; returns ``self``."""
        self.columns_ = self._encode(x).columns
        return self

    def transform(self, single_feature):
        """Return one indicator column per tag for ``single_feature``.

        After ``fit`` the output has exactly the fitted columns (unseen tags
        are dropped, absent tags are zero-filled). Without a prior ``fit``
        the columns are whatever tags occur in ``single_feature``.
        """
        dummies = self._encode(single_feature)
        columns = getattr(self, 'columns_', None)
        if columns is None:
            return dummies
        return dummies.reindex(columns=columns, fill_value=0)

    @staticmethod
    def _join_tags(raw):
        """Turn one cell into a '*'-separated string of its tags."""
        if pd.isnull(raw):
            # A missing cell means "no tags" rather than a crash.
            return ''
        # ast.literal_eval only accepts Python literals, so text coming from
        # the CSV can no longer execute arbitrary code as it could with eval.
        return '*'.join(ast.literal_eval(raw))

    @classmethod
    def _encode(cls, single_feature):
        """Indicator frame for the tags that occur in ``single_feature``."""
        joined = single_feature.apply(cls._join_tags)
        # NOTE(review): a tag that itself contains '*' would be split in two.
        return joined.str.get_dummies(sep='*')

class CheckinListTransformer(TransformerMixin):
    """Flag every value that contains at least one of the given keywords.

    This class used to be defined twice in a row with identical bodies (the
    second silently replaced the first); only one definition is kept.

    Parameters
    ----------
    keywords : iterable of str, optional
        Substrings to look for. Defaults to no keywords, in which case every
        value is flagged ``False``.
    """

    def __init__(self, keywords=None):
        # A fresh list per instance: the old ``keywords=[]`` default was one
        # list object shared by every instance created without the argument.
        self.keywords = list(keywords) if keywords is not None else []

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, single_feature):
        """Return a one-column DataFrame of booleans, one row per value.

        NOTE(review): matching is case-sensitive. The pipeline passes
        lower-case keywords ('science', 'engineering', 'studies') while the
        sample rows show title-cased departments ('Business'), so some
        intended matches may be missed - confirm against the data.
        """
        keywords = self.keywords

        def contains_keyword(value):
            if pd.isnull(value):
                # ``in`` on a float NaN would raise TypeError; a missing
                # value simply matches nothing.
                return False
            return any(keyword in value for keyword in keywords)

        return single_feature.apply(contains_keyword).to_frame()

class DateTransformer(TransformerMixin):
    """Stateless transformer that parses a column of dates."""

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, single_feature):
        """Return ``single_feature`` converted with ``pd.to_datetime``.

        The whole column is converted in one vectorised call instead of
        invoking ``pd.to_datetime`` once per element through ``apply``, which
        is far slower on long columns.
        NOTE(review): this assumes every value uses the same date format, as
        in the sample rows (MM/DD/YYYY).
        """
        return pd.to_datetime(single_feature)
    
class GetDayTransformer(TransformerMixin):
    """Stateless transformer: replace each value by its ``dayofweek``."""

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, single_feature):
        """Return a Series holding ``value.dayofweek`` for every element."""

        def weekday_of(stamp):
            # Elements must expose a ``dayofweek`` attribute; in this
            # notebook they come from DateTransformer.
            return stamp.dayofweek

        return single_feature.apply(weekday_of)

class GetMonthTransformer(TransformerMixin):
    """Stateless transformer: replace each value by its ``month``."""

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, single_feature):
        """Return a Series holding ``value.month`` for every element."""

        def month_of(stamp):
            # Elements must expose a ``month`` attribute (e.g. timestamps).
            return stamp.month

        return single_feature.apply(month_of)

In [56]:
def _build_features():
    """Assemble the FeatureUnion that turns a review frame into model inputs.

    Every branch is a Pipeline whose first step selects one column with
    ItemSelector; the remaining steps encode that column. Branch names, step
    names and branch order are exactly those of the original literal
    definition, so the concatenated output columns keep the same layout.
    """

    def branch(column, selector_step, *steps):
        # First step pulls ``column`` out of the frame, then ``steps`` run.
        return Pipeline([(selector_step, ItemSelector(key=column))] + list(steps))

    # (column, selector step name) for the plain one-hot encoded columns.
    # 'getinterets' is spelled as in the original; step names are runtime
    # identifiers, so they are reproduced verbatim.
    one_hot_columns = [
        ('forcredit', 'getforcredit'),
        ('attendance', 'getattendance'),
        ('textbookuse', 'gettextbookuse'),
        ('interest', 'getinterets'),
        ('grade', 'getgrade'),
        ('profgender', 'getprofgender'),
        ('profhotness', 'getprofhotness'),
        ('online', 'getonline'),
    ]
    branches = [
        (column, branch(column, step, ('getdummies', DummyCreator())))
        for column, step in one_hot_columns
    ]

    stem_keywords = ['science', 'mathematics', 'logy', 'engineering']
    humanities_keywords = ['studies']

    branches.extend([
        # Tag list -> one indicator column per tag.
        ('tags', branch('tags', 'gettags',
                        ('getdummies', TagsDummyCreator()))),
        # Free-text comment -> TF-IDF over unigrams and bigrams.
        ('comments', branch('comments', 'getcomments',
                            ('tfidf', TfidfVectorizer(max_features=20000,
                                                      ngram_range=(1, 2))))),
        # Department name -> keyword flags.
        ('isstem', branch('dept', 'getdept',
                          ('isstem', CheckinListTransformer(
                              keywords=stem_keywords)))),
        ('ishumanities', branch('dept', 'getdept',
                                ('ishumanities', CheckinListTransformer(
                                    keywords=humanities_keywords)))),
        # Vote counts are passed through as sparse columns.
        ('helpcount', branch('helpcount', 'helpcount',
                             ('sparse', SparseConvertor()))),
        ('nothelpcount', branch('nothelpcount', 'nothelpcount',
                                ('sparse', SparseConvertor()))),
        # Review date -> day of week -> one-hot.
        ('getdayofweek', branch('date', 'date',
                                ('dateconvert', DateTransformer()),
                                ('dayofweek', GetDayTransformer()),
                                ('getdummies', DummyCreator()))),
        # A month-of-year branch (GetMonthTransformer + DummyCreator) was
        # commented out in the original definition and stays disabled.
    ])
    return FeatureUnion(transformer_list=branches)


features = _build_features()
         

In [62]:
#Cross Val
# Hold-out evaluation: fit the feature pipeline and a Ridge model on one part
# of train.csv and report the mean squared error on the rest.
data = pd.read_csv('train.csv')

# Missing comments become empty strings so the text branch gets strings only;
# missing grades are filled with 'D' before one-hot encoding.
data['comments'] = data['comments'].fillna('')
data['grade'] = data['grade'].fillna('D')

# A fixed seed makes the split, and therefore the reported score, the same on
# every run; without it the number changes each time the cell is executed.
train, test = train_test_split(data, random_state=0)

feat = features.fit_transform(train)
model = Ridge()
model.fit(feat, train['quality'])

testfeat = features.transform(test)
# Predictions are clipped to the range 2..10 before scoring.
cvpredict = np.clip(model.predict(testfeat), 2, 10)
mse = mean_squared_error(test['quality'], cvpredict)
# The call form prints the same text under Python 2 and also runs on Python 3.
print(mse)

2.50347101011


In [60]:
# Fit on the full training file and predict quality for test.csv.
train = pd.read_csv('train.csv')
train['comments'] = train['comments'].fillna('')
# NOTE(review): the cross-validation cell also fills missing 'grade' values
# with 'D'; that step is not applied here - confirm which is intended.

test = pd.read_csv('test.csv')
test['comments'] = test['comments'].fillna('')

feat = features.fit_transform(train)

# Create the estimator here instead of relying on a `model` object left over
# from another cell, so the cell also runs on a fresh kernel.
model = Ridge()
model.fit(feat, train['quality'])

testfeat = features.transform(test)
# Clip to 2..10 exactly as in the cross-validation cell, so the submitted
# predictions go through the same post-processing as the evaluated ones.
prediction = np.clip(model.predict(testfeat), 2, 10)


In [13]:
# Write the submission file: one "id,quality" line per test row.
test_ids = test['id']
if len(test_ids) != len(prediction):
    # Guard against `test` and `prediction` coming from different runs.
    raise ValueError("test has %d rows but prediction has %d values"
                     % (len(test_ids), len(prediction)))

# The `with` block closes the file on exit, so no explicit close() is needed.
with open('results.txt', 'w') as opfile:
    opfile.write("id,quality\n")
    # Pair ids and predictions by position. The old `test_ids[i]` was a
    # label lookup, which misaligns or fails whenever the frame's index is
    # not 0..n-1 (for example after train_test_split).
    for review_id, quality in zip(test_ids.values, prediction):
        opfile.write(str(review_id) + "," + str(quality) + "\n")


2.65832237814
