# Part 1 - Working with Text Data

### Use Python string methods remove irregular whitespace from the following string:

In [146]:
whitespace_string = "\n\n  This is a    string   that has  \n a lot of  extra \n   whitespace.   "

print(whitespace_string)



  This is a    string   that has  
 a lot of  extra 
   whitespace.   


In [147]:
print(" ".join(whitespace_string.split()))

This is a string that has a lot of extra whitespace.


### Use Regular Expressions to take the dates in the following .txt file and put them into a dataframe with columns for:

[RegEx dates.txt](https://github.com/ryanleeallred/datasets/blob/master/dates.txt)

- Day
- Month
- Year


In [148]:
import re
import io
import pandas as pd
import requests
import gensim
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [149]:
url = "https://raw.githubusercontent.com/ryanleeallred/datasets/master/dates.txt"
text = requests.get(url).text

In [150]:
ml = re.findall("(\w+)\s*(\d{1,2})\,\s*(\d+)", text)
df = pd.DataFrame(ml, columns = ['Month','Day','Year'])

# Part 2 - Bag of Words 

### Use the twitter sentiment analysis dataset found at this link for the remainder of the Sprint Challenge:

[Twitter Sentiment Analysis Dataset](https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv)

 ### Clean and tokenize the documents ensuring the following properties of the text:

1) Text should be lowercase.

2) Stopwords should be removed.

3) Punctuation should be removed.

4) Tweets should be tokenized at the word level. 

(The above don't necessarily need to be completed in that specific order.)

### Output some cleaned tweets so that we can see that you made all of the above changes.


In [151]:
url = "https://raw.githubusercontent.com/ryanleeallred/datasets/master/twitter_sentiment_binary.csv"

In [152]:
text = requests.get(url).text

In [153]:
df = pd.read_csv(url)

In [154]:
stop_words = set(stopwords.words('english'))

# turn a doc into clean tokens
def clean_doc(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
# 	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word.lower() for word in tokens if len(word) > 1]
	return tokens

def clean_sentence(doc):
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# filter out stop words
# 	stop_words = set(stopwords.words('english'))
	tokens = [w for w in tokens if not w in stop_words]
	# filter out short tokens
	tokens = [word.lower() for word in tokens if len(word) > 1]
	return " ".join(tokens)

df['cleaned'] = df.SentimentText.apply(clean_doc)
df['cleanedSentence'] = df.SentimentText.apply(clean_sentence)
print(df.shape)
df.head()

(99989, 4)


Unnamed: 0,Sentiment,SentimentText,cleaned,cleanedSentence
0,0,is so sad for my APL frie...,"[sad, apl, friend]",sad apl friend
1,0,I missed the New Moon trail...,"[missed, new, moon, trailer]",missed new moon trailer
2,1,omg its already 7:30 :O,"[omg, already]",omg already
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, im, sooo, im, gunna, cry, ive, dentis...",omgaga im sooo im gunna cry ive dentist since ...
4,0,i think mi bf is cheating on me!!! ...,"[think, mi, bf, cheating, tt]",think mi bf cheating tt


In [155]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,1), stop_words='english')
vectorizer.fit(df.cleanedSentence)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### How should TF-IDF scores be interpreted? How are they calculated?

In TF-IDF the higher rated words have more discriminative power than the lower rated words because they more specifically identify the category.    The TF-IDF for a term is the product of its term frequency and the scaled inverse of its document frequency.

In [156]:
X_train = df.cleanedSentence

In [157]:
train_word_counts = vectorizer.transform(X_train)
X_train_vectorized = pd.DataFrame(train_word_counts[0:10000].toarray(), columns=vectorizer.get_feature_names())
X_train_vectorized = X_train_vectorized.fillna(0)
print(X_train_vectorized.shape)
X_train_vectorized.head()

(10000, 10000)


Unnamed: 0,aa,aaa,aaah,aaahh,aafreen,aah,aahhh,aalaap,aamyhaanson,aaron,...,zero,zip,zombie,zombies,zomg,zone,zones,zoo,zoom,zune
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Part 3 - Document Classification

1) Use Train_Test_Split to create train and test datasets.

2) Vectorize the tokenized documents using your choice of vectorization method. 

 - Stretch goal: Use both of the methods that we talked about in class.

3) Create a vocabulary using the X_train dataset and transform both your X_train and X_test data using that vocabulary.

4) Use your choice of binary classification algorithm to train and evaluate your model's accuracy. Report both train and test accuracies.

 - Stretch goal: Use an error metric other than accuracy and implement/evaluate multiple classifiers.



In [158]:
dfs = df.sample(frac=0.1)
X = dfs.cleanedSentence
y = dfs.Sentiment.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [159]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7999,)
(2000,)
(7999,)
(2000,)


In [160]:
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1,1), stop_words='english')
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [161]:
train_word_counts = vectorizer.transform(X_train)
X_train_vectorized = pd.DataFrame(train_word_counts.toarray(), columns=vectorizer.get_feature_names())

test_word_counts = vectorizer.transform(X_test)
X_test_vectorized = pd.DataFrame(test_word_counts.toarray(), columns=vectorizer.get_feature_names())
print(X_train_vectorized.shape)
print(X_test_vectorized.shape)

(7999, 1000)
(2000, 1000)


In [162]:
df.Sentiment.unique()

array([0, 1])

In [137]:
XGB = XGBClassifier(n_estimators=200, objective="binary:logistic").fit(X_train_vectorized, y_train)
train_predictions = XGB.predict(X_train_vectorized)
test_predictions = XGB.predict(X_test_vectorized)
print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')
print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')
print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')

Train Accuracy: 0.7029628703587949
Test Accuracy: 0.668
Train Roc Auc: 0.674452463770322
Test Roc Auc: 0.6434984820102061


In [138]:
RFC = RandomForestClassifier(n_estimators=200).fit(X_train_vectorized, y_train)

train_predictions = RFC.predict(X_train_vectorized)
test_predictions = RFC.predict(X_test_vectorized)

print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')
print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')
print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')

Train Accuracy: 0.9523690461307663
Test Accuracy: 0.6885
Train Roc Auc: 0.9482529265359557
Test Roc Auc: 0.6829379562043796


In [139]:
LR = LogisticRegression(random_state=42, solver="newton-cg").fit(X_train_vectorized, y_train)

train_predictions = LR.predict(X_train_vectorized)
test_predictions = LR.predict(X_test_vectorized)
print(f'Train Accuracy: {accuracy_score(y_train, train_predictions)}')
print(f'Test Accuracy: {accuracy_score(y_test, test_predictions)}')
print(f'Train Roc Auc: {roc_auc_score(y_train, train_predictions)}')
print(f'Test Roc Auc: {roc_auc_score(y_test, test_predictions)}')

Train Accuracy: 0.7688461057632204
Test Accuracy: 0.6925
Train Roc Auc: 0.7568833533763624
Test Roc Auc: 0.6816460499967703


# Part 4 -  Word2Vec

1) Fit a Word2Vec model on your cleaned/tokenized twitter dataset. 

2) Display the 10 words that are most similar to the word "twitter"

In [140]:
df = pd.read_csv(url)

In [141]:
df['cleaned'] = df.SentimentText.apply(clean_doc)
print(df.shape)
df.head()

(99989, 3)


Unnamed: 0,Sentiment,SentimentText,cleaned
0,0,is so sad for my APL frie...,"[sad, apl, friend]"
1,0,I missed the New Moon trail...,"[missed, new, moon, trailer]"
2,1,omg its already 7:30 :O,"[omg, already]"
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, im, sooo, im, gunna, cry, ive, dentis..."
4,0,i think mi bf is cheating on me!!! ...,"[think, mi, bf, cheating, tt]"


In [142]:
w2v = Word2Vec(df.cleaned) # no parameters made the best result in my opinion, only with that include "twitt"

In [143]:
w2v.wv.most_similar('twitter', topn=10)

[('list', 0.8101412057876587),
 ('facebook', 0.8083671927452087),
 ('following', 0.7973533868789673),
 ('brittanyasnow', 0.777572751045227),
 ('link', 0.7775703072547913),
 ('dm', 0.7766029238700867),
 ('awalliewall', 0.766107439994812),
 ('page', 0.761616051197052),
 ('grats', 0.7601321935653687),
 ('aboard', 0.7586065530776978)]

In [144]:
w2v.wv.doesnt_match(['twitter', 'facebook', 'moon'])

'twitter'

In [145]:
w2v.wv.most_similar(positive=["twitter", "facebook"], negative=["moon"], topn=10)

[('email', 0.7282893657684326),
 ('dm', 0.7136653065681458),
 ('list', 0.6973550319671631),
 ('following', 0.6965770721435547),
 ('sent', 0.6892113089561462),
 ('celiaistall', 0.6811996698379517),
 ('followers', 0.6794509291648865),
 ('follow', 0.6751104593276978),
 ('link', 0.6682579517364502),
 ('message', 0.6654179096221924)]