### import libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
# read file into pandas using a relative path
train   = pd.read_csv("train.csv")
test    = pd.read_csv("test.csv")

In [3]:
train  = train[["Description","Is_Response"]]

In [4]:
train.shape

(38932, 2)

In [5]:
train.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [6]:
# examine the class distribution
train.Is_Response.value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [7]:
# convert label to a numerical variable
train['Is_Response'] = train.Is_Response.map({'not happy':0, 'happy':1})

In [8]:
train.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,0
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,1
4,My girlfriends and I stayed here to celebrate ...,0


In [9]:
# how to define X and y (from the review data) for use with COUNTVECTORIZER
X = train.Description
y = train.Is_Response
print(X.shape)
print(y.shape)

(38932,)
(38932,)


# Stemmer

In [10]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

#stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

In [11]:
#instantiate CountVectorizer (tune the parameter according to the dataset)
vect = CountVectorizer(stop_words='english',analyzer=stemmed_words, ngram_range = (1,1), min_df=150,max_df = 0.5)

In [12]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X)

CountVectorizer(analyzer=<function stemmed_words at 0x7feecf399d90>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=0.5,
        max_features=None, min_df=150, ngram_range=(1, 1),
        preprocessor=None, stop_words='english', strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [13]:
# examine the fitted vocabulary
vect.get_feature_names()

['aaa',
 'abl',
 'about',
 'abov',
 'absolut',
 'ac',
 'accept',
 'access',
 'accommod',
 'accomod',
 'account',
 'acknowledg',
 'across',
 'act',
 'action',
 'activ',
 'actual',
 'ad',
 'add',
 'addit',
 'address',
 'adequ',
 'adjac',
 'adjoin',
 'adjust',
 'admit',
 'adult',
 'advanc',
 'advantag',
 'advertis',
 'advic',
 'advis',
 'advisor',
 'affinia',
 'afford',
 'afraid',
 'after',
 'afternoon',
 'again',
 'against',
 'age',
 'agent',
 'ago',
 'agre',
 'ahead',
 'air',
 'airlin',
 'airport',
 'alamo',
 'alarm',
 'all',
 'alley',
 'allow',
 'almost',
 'alon',
 'along',
 'alot',
 'alreadi',
 'also',
 'altern',
 'although',
 'alway',
 'am',
 'amaz',
 'amazingli',
 'ambianc',
 'ambienc',
 'amen',
 'america',
 'american',
 'among',
 'amount',
 'ampl',
 'an',
 'angel',
 'ani',
 'anniversari',
 'annoy',
 'anoth',
 'answer',
 'antonio',
 'anyon',
 'anyth',
 'anyway',
 'anywher',
 'apart',
 'apolog',
 'apologet',
 'appar',
 'appeal',
 'appear',
 'appet',
 'appl',
 'appoint',
 'appreci',
 

In [14]:
#document term matrix
X_train_dtm = vect.transform(X)

In [15]:
# examine the document-term matrix
X_train_dtm

<38932x1969 sparse matrix of type '<class 'numpy.int64'>'
	with 2744302 stored elements in Compressed Sparse Row format>

In [16]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [17]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time logreg.fit(X_train_dtm, y)

CPU times: user 2.7 s, sys: 16 ms, total: 2.71 s
Wall time: 2.71 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

test description should be stemmed.

In [18]:
# transform real testing data (using fitted vocabulary) into a document-term matrix
X_realtest_dtm = vect.transform(test.Description)
X_realtest_dtm

<29404x1969 sparse matrix of type '<class 'numpy.int64'>'
	with 2078998 stored elements in Compressed Sparse Row format>

In [19]:
# make class predictions for X_test_dtm
y_finalpred_class = logreg.predict(X_realtest_dtm)

In [20]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [21]:
sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':y_finalpred_class})
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))

In [22]:
sub1 = sub1[['User_ID', 'Is_Response']]

In [23]:
## write submission files
sub1.to_csv('submission/sub6_cv.csv', index=False)