In [24]:
import numpy as np
import pandas as pd

# Import LDA, count vectorizer, and dataset from sklearn.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression

# Import plotting modules.
import matplotlib.pyplot as plt
import seaborn as sns
# This ensures that the plots show up in the notebook. 
%matplotlib inline  

In [8]:
# This function will print out the top words for each topic.
def print_top_words(lda_model, vocabulary, n_top_words):
    for topic_idx, topic in enumerate(lda_model.components_):
        print "Topic %d top %d words:" % (topic_idx, n_top_words)
        print ", ".join([vocabulary[i] for i in topic.argsort()[:-n_top_words-1:-1]])
        print
        

In [9]:
def print_topic_importance(transformed_corpus):
    print  '   '.join([("Topic %d" % t) for t in range(transformed_corpus.shape[1])])
    for document_topic_weights in compressed_corpus:
        print '      '.join([str(round(document_topic_weights[t], 2)) for t in range(transformed_corpus.shape[1])])
       

#  Analyzing topics in Spam data

In [3]:
path = r"C:\Users\Javier\Desktop\Coursera\Python\General_Assembly\Data Sets\spam.csv"

spam_ham = pd.read_csv(path, encoding= 'latin1')
spam_ham = spam_ham[spam_ham.v1 == "spam"]
spam_ham.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
8,spam,WINNER!! As a valued network customer you have...,,,


In [4]:
# counting frequency of words inside your corpus
vectorizer = CountVectorizer(stop_words='english', min_df=2, ngram_range=(1, 1))

# here you're fitting your data into the model
vectorized_corpus = vectorizer.fit_transform(spam_ham.iloc[:,1])

In [5]:
# Define the model object.
lda_model = LatentDirichletAllocation(n_topics=10, learning_method='batch')

# Fit the model on the data and transform the data.
compressed_corpus = lda_model.fit_transform(vectorized_corpus)

In [10]:
# Most common words within the Spam data set
vocabulary = vectorizer.get_feature_names()
print_top_words(lda_model, vocabulary, 5)

Topic 0 top 5 words:
nokia, ur, week, free, 16

Topic 1 top 5 words:
prize, min, contact, claim, won

Topic 2 top 5 words:
prize, won, claim, customer, stop

Topic 3 top 5 words:
stop, free, ur, mobile, text

Topic 4 top 5 words:
free, games, ur, special, reveal

Topic 5 top 5 words:
reply, text, free, video, new

Topic 6 top 5 words:
free, orange, latest, mins, double

Topic 7 top 5 words:
free, txt, www, ur, com

Topic 8 top 5 words:
txt, win, www, 100, entry

Topic 9 top 5 words:
claim, holiday, cash, award, sae



# Analyzing topics in Non-Spam data

In [11]:
ham_data = pd.read_csv(path, encoding= 'latin1')
ham_data = ham_data[ham_data.v1 == "ham"]
ham_data.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
3,ham,U dun say so early hor... U c already then say...,,,


In [12]:
# counting frequency of words inside your corpus
vectorizer = CountVectorizer(stop_words='english', min_df=2, ngram_range=(1, 1))

# here you're fitting your data into the model
vectorized_corpus = vectorizer.fit_transform(ham_data.iloc[:,1])

In [13]:
# Define the model object.
lda_model = LatentDirichletAllocation(n_topics=10, learning_method='batch')

# Fit the model on the data and transform the data.
compressed_corpus = lda_model.fit_transform(vectorized_corpus)

In [14]:
# Most common words within the Spam data set
vocabulary = vectorizer.get_feature_names()
print_top_words(lda_model, vocabulary, 5)

Topic 0 top 5 words:
later, sorry, ll, gonna, life

Topic 1 top 5 words:
ok, work, night, doing, right

Topic 2 top 5 words:
day, good, love, hope, send

Topic 3 top 5 words:
gt, lt, did, like, got

Topic 4 top 5 words:
don, know, da, want, going

Topic 5 top 5 words:
hi, dear, ur, happy, dont

Topic 6 top 5 words:
just, know, need, dont, pls

Topic 7 top 5 words:
come, want, text, tomorrow, ll

Topic 8 top 5 words:
way, just, ì_, come, ìï

Topic 9 top 5 words:
lor, home, ok, wat, ì_



# Feeding Logistic Regression

In [15]:
#Building the train and test sets!
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

data_set = pd.read_csv(path, encoding= 'latin1')
dummies = pd.get_dummies(data_set.iloc[:,0])
data_set = data_set.join(dummies)

x = vectorized_corpus = vectorizer.fit_transform(data_set.iloc[:,1])
y = np.array(data_set.iloc[:,-2])

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty = 'l2', C=1)

In [18]:
model.fit(x_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
print "Logistic Reg accuracy is %2.2f" % accuracy_score(y_test, model.predict(x_test))
print "AUC is %2.2f" % roc_auc_score(y_test, model.predict(x_test))

Logistic Reg accuracy is 0.98
AUC is 0.93


In [20]:
a = model.predict_proba(x_test)

predictions = pd.DataFrame(a)
predictions.head()

Unnamed: 0,0,1
0,0.031791,0.968209
1,0.029396,0.970604
2,0.085015,0.914985
3,0.008764,0.991236
4,0.99758,0.00242


# Trying Naive Bayes

In [21]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [22]:
y_pred = gnb.fit(x.toarray(), y).predict(x.toarray())

In [23]:
print("Number of mislabeled points out of a total %d points : %d"% (x.toarray().shape[0],(y != y_pred).sum()))

Number of mislabeled points out of a total 5572 points : 544
