In [1]:
import pandas as pd
import numpy as np

### Load Data

In [2]:
train_df = pd.read_csv('../train.csv', sep="\t")
test_df = pd.read_csv("../test.csv", sep="\t")

### Sentiment Analysis

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiments = SentimentIntensityAnalyzer()

In [6]:
# The following pandas series contains a dictionary. Faster to do
# it this way and later split it based on columns
# (Slower, as pandas apply operation uses only single core)

train_df["answer_polarity_scores"] = train_df['answer_text'].apply(
    lambda x: sentiments.polarity_scores(x))
test_df["answer_polarity_scores"] = test_df['answer_text'].apply(
    lambda x: sentiments.polarity_scores(x))
train_df["question_polarity_scores"] = train_df['question_text'].apply(
    lambda x: sentiments.polarity_scores(x))
test_df["question_polarity_scores"] = test_df['question_text'].apply(
    lambda x: sentiments.polarity_scores(x))


In [9]:
## This could easily be done by creating a function or for loop
## I just did it during the hackathon. But if you are combining
## everything, just write a function to make it look more beautiful

##############
# Train data #
##############

# Sentiment for question_text (q_***)
train_df['q_compound'] = train_df["question_polarity_scores"].apply(lambda x: x['compound'])
train_df['q_pos']= train_df["question_polarity_scores"].apply(lambda x: x['pos'])
train_df['q_neg']= train_df["question_polarity_scores"].apply(lambda x: x['neg'])
train_df['q_neu']= train_df["question_polarity_scores"].apply(lambda x: x['neu'])

# Sentiment for answer_text (a_***)
train_df['a_compound'] = train_df["answer_polarity_scores"].apply(lambda x: x['compound'])
train_df['a_pos']= train_df["answer_polarity_scores"].apply(lambda x: x['pos'])
train_df['a_neg']= train_df["answer_polarity_scores"].apply(lambda x: x['neg'])
train_df['a_neu']= train_df["answer_polarity_scores"].apply(lambda x: x['neu'])

#############
# Test data #
#############

# Sentiment for question_text (q_***)
test_df['q_compound'] = test_df["question_polarity_scores"].apply(lambda x: x['compound'])
test_df['q_pos']= test_df["question_polarity_scores"].apply(lambda x: x['pos'])
test_df['q_neg']= test_df["question_polarity_scores"].apply(lambda x: x['neg'])
test_df['q_neu']= test_df["question_polarity_scores"].apply(lambda x: x['neu'])

# Sentiment for answer_text (a_***)
test_df['a_compound'] = test_df["answer_polarity_scores"].apply(lambda x: x['compound'])
test_df['a_pos']= test_df["answer_polarity_scores"].apply(lambda x: x['pos'])
test_df['a_neg']= test_df["answer_polarity_scores"].apply(lambda x: x['neg'])
test_df['a_neu']= test_df["answer_polarity_scores"].apply(lambda x: x['neu'])

In [10]:
# Now drop the columns that contains the dictionary of sentiments
# It is not needed anymore
train_df.drop(["question_polarity_scores", "answer_polarity_scores"], 
              axis=1, inplace=True)
test_df.drop(["question_polarity_scores", "answer_polarity_scores"], 
              axis=1, inplace=True)

### TF-IDF features

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

In [48]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features=10,
                             use_idf=True,
                             norm='l1'
                            )

In [52]:
# TF-IDF vectorizer for train and test sets on question_text
train_q = vectorizer.fit_transform(train_df["question_text"])
train_q_df = pd.DataFrame(train_q.toarray(), columns=vectorizer.get_feature_names())
train_df.join(train_q_df)

test_q = vectorizer.transform(test_df['question_text'])
test_q_df = pd.DataFrame(test_q.toarray(), columns=vectorizer.get_feature_names())
test_df.join(test_q_df)

# TF-IDF vectorizer for train and test sets on answer_text
train_a = vectorizer.fit_transform(train_df["answer_text"])
train_a_df = pd.DataFrame(train_a.toarray(), columns=vectorizer.get_feature_names())
train_df.join(train_a_df)

test_a = vectorizer.transform(test_df['answer_text'])
test_a_df = pd.DataFrame(test_a.toarray(), columns=vectorizer.get_feature_names())
test_df.join(test_a_df)


Unnamed: 0,id,question_id,subreddit,question_utc,question_text,question_score,answer_utc,answer_text,q_compound,q_pos,...,don,good,gt,just,know,like,people,really,think,time
0,273508,b907a8006a96d2ebd2355e3ac1e3d096,books,1512086400,The anti-circumcision movement has been called...,12,1512092434,Exactly. Someone says racism is objectively w...,0.7383,0.117,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.633421,0.366579,0.000000,0.000000
1,999144,65c927c2e2877d0b2aec46504a159224,AskReddit,1512086402,Naw man he just stated that there were murders...,69,1512086989,"I get that, but still want to know a connectio...",-0.8402,0.000,...,0.000000,0.000000,0.000000,0.439304,0.560696,0.000000,0.000000,0.000000,0.000000,0.000000
2,2184418,0c401b43d32bbf591a246e64d8618063,gaming,1512086403,"Yeah, any self respected sci-fi reader knows E...",10,1512087816,starring the Shrike,0.2500,0.225,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,1175028,4a5d9f9837fa18ad478db25b887b3e4c,IAmA,1512086403,There's nothing that prevents health insurance...,1,1512140360,ehh that's not really painting an accurate pic...,-0.6167,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
4,6793537,960dd387d754ea6d1c748a173d8b9a7e,Jokes,1512086403,"""Boss gotta raise my salary. It won't rise by ...",2,1512086698,"""Come on we work in a profitable bakery. It's ...",0.0000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,4126545,9fc8047cd1123fd5a244bc6a8f15ab6e,gaming,1512086405,That'd be a vasectomy. Pretty sure castration ...,1,1512086468,"Either one works, as long as it's permanent. T...",0.6705,0.407,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
6,356916,de033995518a151cbcd0e9059183d010,funny,1512086407,https://i.imgur.com/d2CupD9.jpg,1,1512090441,Lol nice one,0.0000,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,3143820,93f2d99313058a12bb9ca4b9492d729d,AskReddit,1512086408,Star Wars... To be fair I've seen small piece...,57,1512089271,"The only one I've seen is the phantom menace, ...",0.1655,0.070,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,6738768,93f2d99313058a12bb9ca4b9492d729d,AskReddit,1512086408,Star Wars... To be fair I've seen small piece...,57,1512089503,"I'll remind you again: hey, you should watch S...",0.1655,0.070,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,2790247,93f2d99313058a12bb9ca4b9492d729d,AskReddit,1512086408,Star Wars... To be fair I've seen small piece...,57,1512103673,Same here. I was even given the original box s...,0.1655,0.070,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [55]:
set(train_df.columns) - set(test_df.columns)

{'answer_score'}