- https://www.kdnuggets.com/2018/11/multi-class-text-classification-doc2vec-logistic-regression.html
- https://medium.com/vickdata/detecting-hate-speech-in-tweets-natural-language-processing-in-python-for-beginners-4e591952223
- https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a
- https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca
- https://github.com/catherinewinslet/twitter-sentiment-analysis-algorithm-comparison/blob/master/Sentiment_Analysis.ipynb


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_set =  pd.read_csv("train.csv")
test_set  =  pd.read_csv("test.csv")

In [3]:
train_set.head()

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


In [4]:
test_set.head()

Unnamed: 0,TweetId,TweetText
0,306486520121012224,'28. The home side threaten again through Maso...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....
2,289531046037438464,'@Sochi2014 construction along the shores of t...
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...


In [None]:
test_set['TweetText']

In [None]:
train_set.drop(['TweetId'], axis=1, inplace=True)

In [None]:
train_set.shape

In [None]:
# Check the distriution of complaints by category
train_set.groupby('Label').count()

In [None]:
import nltk
nltk.download()

from nltk.corpus import stopwords
from wordcloud import WordCloud
from textblob import TextBlob
from textblob import Word

In [None]:
words = []
for value in train_set.TweetText.str.split(' '):
     words.extend(value)

In [None]:
#Check number of words in the data
len(words)

In [None]:
#Compute the frequency of all words in the reviews
frequency_dist = nltk.FreqDist(words)
frequency_dist

In [None]:
sorted_frequency_dist =sorted(frequency_dist, key=frequency_dist.__getitem__, reverse=True)
sorted_frequency_dist[:30]

In [None]:
#Consider words with length greater than 3 and plot
plt.figure(figsize=(22,8))
large_words = dict([(k,v) for k,v in frequency_dist.items() if len(k)>3])
frequency_dist = nltk.FreqDist(large_words)
frequency_dist.plot(50,cumulative=False)

In [None]:
from wordcloud import WordCloud

wcloud = WordCloud().generate_from_frequencies(frequency_dist) #background_color="white"

plt.figure(figsize=(22,7))
plt.imshow(wcloud, interpolation='bilinear')
plt.axis("off")
(-0.5, 399.5, 199.5, -0.5)
plt.show()

In [None]:
from gensim.models import FastText
from sklearn.decomposition import PCA
from matplotlib import pyplot

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [None]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean_text_data(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

title_clean = [clean_text_data(doc).split() for doc in train_set.TweetText]

In [None]:
#title_clean

In [None]:
fast = FastText(title_clean,size=20, window=1, min_count=1,workers=5, min_n=1, max_n=2)

In [None]:
print(fast['president'])

In [None]:
print(fast['obama'])

In [None]:
X = fast[fast.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

In [None]:
print(len(result))
result = result[:80]

In [None]:
words = list(fast.wv.vocab)
len(words)

In [None]:
# create a scatter plot of the projection
plt.figure(figsize=(27,7))
pyplot.scatter(result[:, 0], result[:, 1])
plt.title('T – SNE plot')
words = list(fast.wv.vocab)[:80]
for i, word in enumerate(words):
      pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

## Preprocessing

1. Coverting text to LowerCase
2. Removing Punctuation, 
3. Removing StopWords
4. Standardizying text ( we can build our own custom dictionary to look for short and abbrev words)
5. Correction Spelling (typo errors & abbrev)
6. Tokenizing Text
7. Lemmarizing
8. Converting Text to Features
      - TF-IDF 
      - Word Embedding: Word2Vec(CBOW or Skip Gram)
      - FastText: improvised version of word2vec (evaluate the WE by T-SNE plot)
9. Build a text preprocessing pipeline

In [None]:
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

In [None]:
#train_set.head()

In [5]:
def processTextData(tweet):
    
    import re
    import nltk
    from textblob import TextBlob
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import TweetTokenizer
    from nltk.stem import WordNetLemmatizer
    from textblob import Word
    from nltk.util import ngrams
    from wordcloud import WordCloud, STOPWORDS
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    stemmer = WordNetLemmatizer()

    
    #Removes Punctuations and not alphanumeric symbols
    tweet = re.sub(r'[^\w\s]','', tweet)

    #Removes Hyper links
    tweet = re.sub(r'https?:\/\/S+','', tweet)
    
    #Removes unicode strings like "\u002c" and "x96"
    tweet = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', tweet)
    tweet = re.sub(r'[^\x00-\x7f]',r'',tweet)

    #Removes any @mentions 
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    
    #Removes hastag in front of a word #mentions
    tweet = re.sub(r'#[A-Za-z0-9]+','', tweet)
    
    #Removes RT
    tweet = re.sub(r'RT[\s]+','', tweet)
    
    #Removes hastag in front of a word: #word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #Converts Text Data to LowerCase
    tweet = tweet.lower()
    
    #remove numbers
    tweet = "".join([i for i in tweet if not i.isdigit()]) 
                   
    #Removes Stop words
    tweet = " ".join(x for x in tweet.split() if x not in stop)
                   
    #Tokenizing Text
    #tweet = " ".join(x for x in tokenizer.tokenize(tweet))
    tokenizer = TweetTokenizer(preserve_case=False,strip_handles=True, reduce_len=True)
    tweet = tokenizer.tokenize(tweet)
    
    # Lemmatization
    tweet = tweet.split()

    tweet = [stemmer.lemmatize(word) for word in tweet]
    tweet = ' '.join(tweet)
    
    return tweet

In [6]:
train_set['TweetText'] = train_set['TweetText'].apply(processTextData)

AttributeError: 'list' object has no attribute 'split'

In [None]:
test_set['TweetText']  = test_set['TweetText'].apply(processTextData)

In [None]:
test_set['TweetText']

In [None]:
train_set['TweetText']

In [None]:
train_set['Label_encoding'] = train_set['Label'].map({'Politics':0, 'Sports':1})

In [None]:
X = train_set['TweetText']
y = train_set['Label_encoding']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC
import xgboost  as xgb

from sklearn.metrics import confusion_matrix, classification_report, f1_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
models_dict = {
    
'LR_model' :  make_pipeline(
                            CountVectorizer(),
                            TfidfTransformer(),
                            SGDClassifier(random_state=0, n_jobs=-1)),
'LR_model' :  make_pipeline(
                            CountVectorizer(),
                            TfidfTransformer(),
                            SGDClassifier(random_state=0, n_jobs=-1)),
'SVC_model' : make_pipeline(
                            CountVectorizer(),
                            TfidfTransformer(),
                            SVC(random_state=0)),
'RFC_model' : make_pipeline(
                            CountVectorizer(),
                            TfidfTransformer(),
                            RandomForestClassifier(random_state=0, n_jobs=-1)),
'XGB_model' : make_pipeline(
                            CountVectorizer(),
                            TfidfTransformer(), 
                            xgb.XGBClassifier())
    
}

In [None]:
%%time

for mod_name, model in models_dict.items():
    model = model.fit(X_train, y_train)
    print('***'+ mod_name +'***')
    print('{}: Train score  {}'.format(mod_name, model.score(X_train, y_train)))
    print('{}: Test score   {}'.format(mod_name, model.score(X_test, y_test)))
    print('-----------------------------------------')

In [None]:
SVM_pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), SVC(random_state=0))
SVM_model = SVM_pipeline.fit(X_train, y_train)


LR_pipeline = make_pipeline(CountVectorizer(),TfidfTransformer(),SGDClassifier(random_state=0, n_jobs=-1))
LR_model = LR_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report


pred = SVM_model.predict(X_test)
print(classification_report(y_test, pred, target_names=('Politics', 'Sports')))

## Prediction

In [None]:
#predictions = svm_model.predict(test_set['TweetText'])
#predictions

In [7]:
def decoding_predictions(label):
    if label == 0:
        return 'Politics' 
    else:
        return 'Sports'

In [None]:
test_set['Label']=[decoding_predictions(label) for label in SVM_model.predict(test_set['TweetText'])]
test_set.drop('TweetText', axis=1, inplace=True)
submission = test_set

In [None]:
submission.head()

In [None]:
#This is saved in the same directory as your notebook
filename = 'DeepTweets.csv'

submission.to_csv(filename, index=False)

print('Saved file: ' + filename)

In [None]:
train_set['TweetText'] = train_set['TweetText'].apply(word_tokenize)

In [None]:
import nltk
from nltk import word_tokenize
tokens_words = nltk.word_tokenize(train_set['TweetText'])
tokens_words

In [None]:
train_set

In [None]:
X_train = train_set['TweetText'].values
y_train = train_set['Label_encoding'].values

In [None]:
y_train

In [None]:
tokenizer.tokenize(re_chosen)

In [None]:
# TfidfVectorizer 
# CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd
# set of documents
train = ['The sky is blue.','The sun is bright.']
test = ['The sun in the sky is bright', 'We can see the shining sun, the bright sun.']
# instantiate the vectorizer object
countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
# convert th documents into a matrix
count_wm = countvectorizer.fit_transform(train)
tfidf_wm = tfidfvectorizer.fit_transform(train)
#retrieve the terms found in the corpora
# if we take same parameters on both Classes(CountVectorizer and TfidfVectorizer) , it will give same output of get_feature_names() methods)
#count_tokens = tfidfvectorizer.get_feature_names() # no difference
count_tokens = countvectorizer.get_feature_names()
tfidf_tokens = tfidfvectorizer.get_feature_names()
df_countvect = pd.DataFrame(data = count_wm.toarray(),index = ['Doc1','Doc2'],columns = count_tokens)
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = ['Doc1','Doc2'],columns = tfidf_tokens)
print("Count Vectorizer\n")
print(df_countvect)
print("\nTD-IDF Vectorizer\n")
print(df_tfidfvect)

In [None]:
import re

def cleantxt(tweets):
    tweets = re.sub(r'[^\w\s]','', tweets)# REMOVE Punctuation
    tweets = re.sub(r'@[A-Za-z0-9]+','', tweets) 
    tweets = re.sub(r'#[A-Za-z0-9]+','', tweets) # REMOVE patterns like @word
    tweets = re.sub(r'RT[\s]+','', tweets)
    tweets = re.sub(r'https?:\/\/S+','', tweets)
    
    return tweets

In [None]:
train_set['TweetText'] = train_set['TweetText'].apply(cleantxt)

In [None]:
import nltk
nltk.download()

In [None]:
# Converting Text Data to LowerCase
train_set['TweetText'] = train_set['TweetText'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# Removing Stop Words
from nltk.corpus import stopwords

stop = stopwords.words('english')
train_set['TweetText'] = train_set['TweetText'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

#Standardizating Text ---> improvement
# Correcting Spelling
#from textblob import TextBlob
#train_set['TweetText'] = train_set['TweetText'].apply(lambda x: str(TextBlob(x).correct()))

##Tokenizing Text
from nltk.tokenize import word_tokenize
train_set['TweetText'] = train_set['TweetText'].apply(word_tokenize)

##Lemmatizing
from textblob import Word
#tweet =" ".join([Word(word).lemmatize() for word in tweet.split()])
#train_set['TweetText'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))