In [2]:
# Importing modules for data science and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# NLP Libraries
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
# ML Libraries
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics import accuracy_score

In [None]:
# Read the CSV inputs; missing cells in train/test are replaced by a single space
train, test = (pd.read_csv(csv_path).fillna(' ') for csv_path in ("train.csv", "test.csv"))
sample_submission = pd.read_csv('sample_submission.csv')

In [None]:
# Visualizing the distribution of the target.
# Uses the explicit Axes interface, labels both axes so the figure stands on
# its own, and ends with `;` so no stray Text(...) repr is shown under the plot.
fig, ax = plt.subplots()
ax.hist(train['sentiment'], label='training data')
ax.set(title='Distribution of target labels', xlabel='sentiment', ylabel='count')
ax.legend();

In [None]:
def data_preprocessor(df):
    '''
    Clean and tokenize the text in ``df['message']``.

    Each message goes through the following steps, in this order:

    1. URLs and HTML tags are replaced by a space.
    2. Every character that is not an ASCII letter is replaced by a space
       (this also discards digits, punctuation and emojis).
    3. The text is lower-cased so it can be matched against the lower-case
       NLTK English stop-word list.
    4. The text is tokenized, stop words are dropped, and the remaining
       tokens are normalized with WordNetLemmatizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'message' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with a 'tweet tokens'
        column holding one list of tokens per row.
    '''
    stop_words = set(stopwords.words('english'))
    lemm = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')

    # The patterns do not depend on the row, so compile them once.
    url = re.compile(r'https?://\S+|www\.\S+')
    html = re.compile(r'<.*?>')
    non_letters = re.compile('[^a-zA-Z]')

    Tokenized_Doc = []
    print("Preprocessing data.........\n")
    for data in df['message']:
        # URLs and tags have to be removed while ':', '/', '.', '<' and '>'
        # are still present; stripping non-letters first would leave
        # fragments such as "https" and "com" behind as tokens.
        review = url.sub(' ', data)
        review = html.sub(' ', review)
        # Dropping non-letters also removes emojis, so no separate emoji
        # pattern is needed.
        review = non_letters.sub(' ', review).lower()
        tokens = tokenizer.tokenize(review)
        gen_tweets = [lemm.lemmatize(token) for token in tokens if token not in stop_words]
        Tokenized_Doc.append(gen_tweets)

    # Assign once, aligned to df's own index, so frames without a default
    # RangeIndex still get one token list per row.
    df['tweet tokens'] = pd.Series(Tokenized_Doc, index=df.index, dtype=object)
    return df

In [None]:
# Tokenize the training messages; this adds the 'tweet tokens' column
train_df = data_preprocessor(train)
# Preview the first rows rather than rendering the entire frame
train_df.head()

In [None]:
# Token lists and target labels taken from the preprocessed training frame
X, y = train_df['tweet tokens'], train_df['sentiment']

In [None]:
# Build the training corpus: one space-joined string per tokenized message
data = train_df['tweet tokens']
corpus = [' '.join(tokens) for tokens in data]

# TF-IDF features; terms present in more than 95% of documents are ignored
vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the vocabulary size, a sample of terms and the matrix shape.
# X is sparse; X.toarray() would allocate a dense (documents x vocabulary) array.
print(len(feature_names), list(feature_names[:20]))
print(X.shape)

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with the liblinear solver.
# Hyperparameters left disabled in the original cell: C=5, penalty='l2', max_iter=1000
scikit_log_reg = LogisticRegression(solver='liblinear', random_state=42)
scikit_log_reg.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` is the fitted classifier
model = scikit_log_reg

In [None]:
# Predict labels for the held-out split and display them
y_pred = model.predict(X_test)
y_pred


In [None]:
# Class probabilities for the held-out split (one row per sample)
probs = model.predict_proba(X_test)
probs

In [None]:
# Confusion matrix for the held-out split (rows: true labels, columns: predicted labels).
# `metrics` is imported with the other libraries in the notebook's import cell.
print(metrics.confusion_matrix(y_test, y_pred))

In [None]:
# Text summary of classification metrics comparing y_test with y_pred
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Run the same preprocessing on the test set; data_preprocessor adds the
# 'tweet tokens' column to `test` itself and returns that same frame
test_df = data_preprocessor(test)

In [None]:
# Build the test corpus the same way as the training corpus
data2 = test_df['tweet tokens']
corpus = [' '.join(tokens) for tokens in data2]

# Reuse the vectorizer fitted on the training corpus (transform only, no refit).
# The `copy` argument is no longer accepted by TfidfVectorizer.transform()
# (deprecated in scikit-learn 0.22, removed in 0.24), so it is not passed.
tests = vectorizer.transform(corpus)

In [None]:
# Predict labels for the vectorized test set
pred = model.predict(tests)
# NOTE(review): if `pred` is a NumPy array, `pred[:]` is a view that shares
# memory with `pred`, not an independent copy — confirm that is intended
predictions = pred[:]
predictions