In [140]:
import pandas as pd
from nltk.corpus import stopwords
import seaborn as sns
import numpy as np
import spacy 
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression

In [141]:
import matplotlib.pyplot as plt

In [142]:
friends = pd.read_csv("friends_quotes.csv")

In [143]:
main_characters = ["Monica", "Joey", "Phoebe", "Ross", "Chandler", "Rachel"] 

In [144]:
dataset= friends[friends.author.isin(main_characters)].drop(columns=['episode_number', 'episode_title', "quote_order", "season"]).reset_index(drop = True)
dataset #dataset para trabajar 

Unnamed: 0,author,quote
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."
...,...,...
45477,Chandler,"Oh, it's gonna be okay."
45478,Rachel,(crying) Do you guys have to go to the new hou...
45479,Monica,We got some time.
45480,Rachel,"Okay, should we get some coffee?"


In [145]:
dataset["author"].value_counts()

Rachel      8318
Ross        8088
Monica      7516
Chandler    7488
Joey        7373
Phoebe      6699
Name: author, dtype: int64

## Naive_bayes (1st trial)

In [111]:
#Code https://www.geeksforgeeks.org/

corpus = []
  
for i in range(0, 45482):
    text = re.sub('[^a-zA-Z]', '', dataset['quote'][i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = ''.join(text)
    corpus.append(text)
  
# creating bag of words model
cv = CountVectorizer(max_features = 1500)
  
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 0].values

In [112]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [113]:
# splitting the data set into training set and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.25, random_state = 0)
# fitting naive bayes to the training set
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
  
classifier = GaussianNB();
classifier.fit(X_train, y_train)
  
# predicting test set results
y_pred = classifier.predict(X_test)
  
# making the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[  59,   25,   12, 1720,   14,    8],
       [  13,   54,   11, 1708,   20,    8],
       [  13,   21,   53, 1766,   16,    9],
       [   8,   22,   16, 1599,   13,    6],
       [   9,   25,   31, 1999,   59,   11],
       [  12,   28,   25, 1887,   23,   68]], dtype=int64)

In [114]:
pd.DataFrame(y_test).value_counts()

Rachel      2134
Ross        2043
Monica      1878
Chandler    1838
Joey        1814
Phoebe      1664
dtype: int64

In [115]:
pd.DataFrame(y_pred).value_counts()

Phoebe      10679
Joey          175
Monica        148
Rachel        145
Chandler      114
Ross          110
dtype: int64

## Naive_bayes (2nd trial)

In [148]:
wordnet_lemmatizer = WordNetLemmatizer()

def lemSentence(df):
    sentence_words = nltk.word_tokenize(df)
    lem_sentence=[]
    for word in sentence_words:
        lem_sentence.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        lem_sentence.append(" ")
        
    return "".join(lem_sentence)

In [149]:
englishStemmer=SnowballStemmer("english", ignore_stopwords=True)

def stemSentence(df):
    token_words=word_tokenize(df)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(englishStemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [150]:
dataset

Unnamed: 0,author,quote
0,Monica,There's nothing to tell! He's just some guy I ...
1,Joey,"C'mon, you're going out with the guy! There's ..."
2,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Phoebe,"Wait, does he eat chalk?"
4,Phoebe,"Just, 'cause, I don't want her to go through w..."
...,...,...
45477,Chandler,"Oh, it's gonna be okay."
45478,Rachel,(crying) Do you guys have to go to the new hou...
45479,Monica,We got some time.
45480,Rachel,"Okay, should we get some coffee?"


In [151]:
dataset["quote"] = dataset["quote"].str.replace("\(([^)]+)\)","").str.lower().str.replace("[^a-zA-Z ]","").apply(lemSentence).apply(stemSentence)

In [152]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.005, max_df=0.95, max_features = 1500 )

In [153]:
X = tfidf.fit_transform(dataset["quote"]).toarray() 
y = dataset.iloc[:, 0].values

In [122]:
X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.25, random_state = 0)

classifier = GaussianNB();
classifier.fit(X_train, y_train)
  
y_pred = classifier.predict(X_test)
  
cm = confusion_matrix(y_test, y_pred)
cm

array([[391, 376, 207, 229, 469, 166],
       [265, 484, 193, 243, 465, 164],
       [280, 327, 283, 290, 534, 164],
       [237, 293, 207, 312, 479, 136],
       [291, 334, 237, 289, 818, 165],
       [309, 404, 213, 257, 568, 292]], dtype=int64)

In [123]:
pd.DataFrame(y_pred).value_counts()

Rachel      3333
Joey        2218
Chandler    1773
Phoebe      1620
Monica      1340
Ross        1087
dtype: int64

In [124]:
pd.DataFrame(y_test).value_counts()

Rachel      2134
Ross        2043
Monica      1878
Chandler    1838
Joey        1814
Phoebe      1664
dtype: int64

## Decison Tree

In [156]:
X_train, X_test, y_train, y_test = train_test_split(
           X, y, test_size = 0.25, random_state = 0)

In [157]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [158]:
y_pred = clf.predict(X_test)

from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.22108873450004396


## Logistic Regression

In [159]:
lg = LogisticRegression(max_iter = 10000000)

In [160]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
