In [1]:
import numpy as np
import pandas as pd
import os
import nltk
import re

# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis-msa-phase-2/train.csv
/kaggle/input/sentiment-analysis-msa-phase-2/test.csv
/kaggle/input/sentiment-analysis-msa-phase-2/sample_submission.csv


Reading in the data files

In [2]:
# Load the labelled training tweets, then preview the first five rows.
train_set = pd.read_csv(
    filepath_or_buffer="/kaggle/input/sentiment-analysis-msa-phase-2/train.csv",
)
train_set.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [3]:
# Load the unlabelled test tweets, then preview the first five rows.
test_set = pd.read_csv(
    filepath_or_buffer="/kaggle/input/sentiment-analysis-msa-phase-2/test.csv",
)
test_set.head(5)

Unnamed: 0,textID,text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!


Removing all the links and tokenizing the words

In [4]:
# Matches a URL, plus an opening parenthesis directly in front of it if present.
URL_PATTERN = re.compile(r'\(?http\S+')


def clean_text(raw):
    """Strip links from one tweet and re-join its NLTK tokens with single spaces.

    Parameters
    ----------
    raw : object
        The raw cell value from the ``text`` column. Missing values
        (``None`` / ``NaN``) are treated as an empty tweet.

    Returns
    -------
    str
        The tweet with URLs removed and tokens separated by one space.
    """
    # str(NaN) is the literal string "nan", which would later be counted as a
    # real word by the vectoriser; map missing values to an empty tweet instead.
    text = "" if pd.isna(raw) else str(raw)
    text = URL_PATTERN.sub('', text)
    return ' '.join(nltk.word_tokenize(text))


# Train and test must go through exactly the same preprocessing.
train_set.text = train_set.text.apply(clean_text)
test_set.text = test_set.text.apply(clean_text)

In [5]:
def toValue(sentiment):
    """Map a sentiment label to its integer class id.

    Parameters
    ----------
    sentiment : str
        One of ``"negative"``, ``"neutral"`` or ``"positive"``.

    Returns
    -------
    int
        ``0`` for negative, ``1`` for neutral, ``2`` for positive.

    Raises
    ------
    ValueError
        If ``sentiment`` is not one of the three known labels. Unknown input
        used to fall through to ``2``, so applying the encoding a second time
        (when the column already holds "0"/"1"/"2") silently relabelled every
        row as positive; failing loudly prevents that.
    """
    label_to_id = {"negative": 0, "neutral": 1, "positive": 2}
    try:
        return label_to_id[sentiment]
    except (KeyError, TypeError):
        raise ValueError(
            "Unknown sentiment label: {!r}".format(sentiment)
        ) from None

# Replace each textual label with its integer class id (via toValue).
encoded_labels = [toValue(str(label)) for label in train_set["sentiment"]]
train_set["sentiment"] = encoded_labels
train_set.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded , if I were going",1
1,549e992a42,Sooo SAD I will miss you here in San Diego ! ! !,0
2,088c60f138,my boss is bullying me ...,0
3,9642c003ef,what interview ! leave me alone,0
4,358bd9e861,"Sons of **** , why couldn`t they put them on t...",0


In [6]:
# Class balance check: number of training rows per encoded sentiment id.
train_set["sentiment"].value_counts()

1    11118
2     8582
0     7781
Name: sentiment, dtype: int64

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokenise on runs of ASCII letters/digits so punctuation never becomes a feature.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# The tokenizer above was previously built but never handed to the vectoriser,
# so it had no effect. Pass it explicitly; token_pattern is ignored once a
# custom tokenizer is supplied, so set it to None to avoid the unused-parameter
# warning.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    tokenizer=token.tokenize,
    token_pattern=None,
)

# Learn the vocabulary from the training tweets and build the bag-of-words matrix.
text_counts = cv.fit_transform(train_set.text)

In [8]:
from sklearn.model_selection import train_test_split
# Hold back 30% of the vectorised training rows (fixed seed for repeatability).
# Returned order: train features, held-out features, train labels, held-out labels.
split_parts = train_test_split(
    text_counts,
    train_set['sentiment'],
    random_state=10,
    test_size=0.3,
)
X_train, X_testasdf, y_train, y_test = split_parts

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Sanity check on the held-out split before producing a submission.
# X_testasdf / y_test are the held-out features and labels from the split cell.
# NOTE: the vectoriser vocabulary was fitted on all training rows (held-out
# rows included), so this estimate may be slightly optimistic.
holdout_clf = MultinomialNB().fit(X_train, y_train)
holdout_accuracy = metrics.accuracy_score(y_test, holdout_clf.predict(X_testasdf))
print("Hold-out accuracy: {:.4f}".format(holdout_accuracy))

# Vectorise the competition test tweets with the vocabulary learned on train.
X_test = cv.transform(test_set.text)

# Final model: refit on every labelled row, then predict the test set.
clf = MultinomialNB().fit(text_counts, train_set.sentiment)
predicted = clf.predict(X_test)

In [10]:
# Pair each test tweet id with its predicted class id, in submission column order.
data = dict(textID=test_set.textID, sentiment=predicted)
final = pd.DataFrame(data, columns=["textID", "sentiment"])
final.head()

Unnamed: 0,textID,sentiment
0,f87dea47db,2
1,96d74cb729,2
2,eee518ae67,0
3,01082688c6,2
4,33987a8ee5,1


In [11]:
def toSentimentValue(x):
    """Map an encoded class id back to its sentiment label.

    Parameters
    ----------
    x : str or int
        The class id, either as a string (``"0"``, ``"1"``, ``"2"``) or as an
        integer; it is compared by its ``str()`` form.

    Returns
    -------
    str
        ``"negative"`` for 0, ``"neutral"`` for 1, ``"positive"`` for 2.

    Raises
    ------
    ValueError
        If ``x`` is not one of the three known ids. Unknown input used to fall
        through to ``"positive"``, so applying the decoding a second time (when
        the column already holds label strings) silently turned every row
        into "positive"; failing loudly prevents that.
    """
    id_to_label = {"0": "negative", "1": "neutral", "2": "positive"}
    key = str(x)
    if key not in id_to_label:
        raise ValueError("Unknown sentiment class id: {!r}".format(x))
    return id_to_label[key]

# Turn the predicted class ids back into label strings (via toSentimentValue).
decoded_labels = [toSentimentValue(str(class_id)) for class_id in final["sentiment"]]
final["sentiment"] = decoded_labels
final.head()

Unnamed: 0,textID,sentiment
0,f87dea47db,positive
1,96d74cb729,positive
2,eee518ae67,negative
3,01082688c6,positive
4,33987a8ee5,neutral


In [12]:
# Write the labelled predictions to the working directory; index=False keeps
# the DataFrame row index out of the file.
final.to_csv(path_or_buf="/kaggle/working/output.csv", index=False)