In [1]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

In [2]:
from nltk.corpus import stopwords

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [6]:
from sklearn.ensemble import VotingClassifier

In [7]:
import pickle
from joblib import dump, load

In [8]:
# Pickle file for each trained model, keyed by the name used to refer to it.
model_paths = {
    "k_nearest_neighbors": "knn_model.pkl",
    "decision_tree": "decision_tree_model.pkl",
    "random_forest": "random_forest_model.pkl",
    "naive_bayes": "naive_bayes_model.pkl",
    "voting_classifier": "voting_classifier.pkl"
}

def load_models(paths=None):
    """Unpickle every model listed in ``paths`` and return them keyed by name.

    Args:
        paths: Optional mapping of model name -> pickle file path. When
            omitted, the module-level ``model_paths`` mapping is used, so
            existing ``load_models()`` calls behave as before.

    Returns:
        dict: each model name mapped to the object unpickled from its file,
        in the iteration order of ``paths``.

    Raises:
        OSError: propagated from ``open`` when a file cannot be opened.
    """
    if paths is None:
        paths = model_paths

    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at model files that come from a trusted source.
    models_by_name = {}
    for name, path in paths.items():
        with open(path, 'rb') as model_file:
            models_by_name[name] = pickle.load(model_file)
    return models_by_name


In [9]:
loaded_models = load_models()

In [10]:
loaded_models

{'k_nearest_neighbors': KNeighborsClassifier(),
 'decision_tree': DecisionTreeClassifier(),
 'random_forest': RandomForestClassifier(),
 'naive_bayes': MultinomialNB(),
 'voting_classifier': <SklearnClassifier(VotingClassifier(estimators=[('K Nearest Neighbors', KNeighborsClassifier()),
                              ('Decision Tree', DecisionTreeClassifier()),
                              ('Random Forest', RandomForestClassifier()),
                              ('Naive Bayes', MultinomialNB())],
                  n_jobs=-1))>}

In [11]:
message = pd.DataFrame({"message": [
                                    "As a woman you shouldn't complain about cleaning up your house.",
                                    "All niggas people are disgusting criminals.",
                                    "I hope all the faggots die.",
                                    "As a woman you should dedicate yourself to cleaning and scrubbing. Dirty filthy slut.",
                                    "As a woman you should dedicate yourself to cleaning and scrubbing.",
                                    "I love you.",
                                    "The other day I ate some fries that were disgusting."
                                    ]})

In [12]:
message = message['message']

Procesamos la cadena con expresiones regulares, igual que hicimos con los datos de entrenamiento.

In [13]:
# Clean-up pass, applied in order: drop leading '!' runs, remove "RT "
# markers, replace @mentions with a placeholder, then trim outer whitespace.
message = (
    message
    .str.replace(r'^!+', '', regex=True)
    .str.replace(r'\bRT\b\s+', '', regex=True)
    .str.replace(r'@\w+', 'users_name', regex=True)
    .str.strip()
)

In [14]:
# Remove punctuation: every character that is not a word character or
# whitespace becomes a single space.
message = message.str.replace(r'[^\w\d\s]', ' ', regex=True)

# Replace whitespace between terms with a single space.
# regex=True must be explicit: since pandas 2.0 the default is regex=False,
# which treats the pattern as a literal string and silently replaces nothing.
message = message.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace
message = message.str.replace(r'^\s+|\s+?$', '', regex=True)

In [15]:
# Lowercase every message, then display the resulting Series.
message = message.str.lower()
message

0    as a woman you shouldn t complain about cleani...
1          all niggas people are disgusting criminals 
2                          i hope all the faggots die 
3    as a woman you should dedicate yourself to cle...
4    as a woman you should dedicate yourself to cle...
5                                          i love you 
6    the other day i ate some fries that were disgu...
Name: message, dtype: object

Eliminamos las stopwords.

In [16]:
# English stopword list from NLTK, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Rebuild each message from only the whitespace-separated terms that are not
# stopwords.
message = message.apply(
    lambda text: ' '.join(
        filter(lambda token: token not in stop_words, text.split())
    )
)

In [17]:
message

0                        woman complain cleaning house
1                   niggas people disgusting criminals
2                                     hope faggots die
3    woman dedicate cleaning scrubbing dirty filthy...
4                    woman dedicate cleaning scrubbing
5                                                 love
6                             day ate fries disgusting
Name: message, dtype: object

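Reducimos las palabras a su base (stemming). Nota: este paso está desactivado actualmente; el código de la celda siguiente está comentado para probar el resultado sin stemming.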

In [18]:
# Experiment: stemming is disabled here to see how the models behave without it.
# Uncomment the two statements below to restore Porter stemming.

# ps = nltk.PorterStemmer()

# message = message.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [19]:
message

0                        woman complain cleaning house
1                   niggas people disgusting criminals
2                                     hope faggots die
3    woman dedicate cleaning scrubbing dirty filthy...
4                    woman dedicate cleaning scrubbing
5                                                 love
6                             day ate fries disgusting
Name: message, dtype: object

Cargamos nuestro `word_features` desde el archivo `word_features.pkl`.

In [20]:
# Load the pickled feature vocabulary.
# NOTE: pickle.load runs arbitrary code on load; only use a trusted file.
with open('word_features.pkl', 'rb') as vocab_file:
    word_features = pickle.load(vocab_file)

In [21]:
word_features

['name',
 'bitch',
 '128514',
 'bitches',
 'co',
 'http',
 'like',
 'hoes',
 'pussy',
 'hoe',
 '8220',
 '8221',
 'got',
 'ass',
 'get',
 'fuck',
 'u',
 'shit',
 '8230',
 'nigga',
 'trash',
 'lol',
 'amp',
 'know',
 'niggas',
 'one',
 'love',
 'na',
 'go',
 'fucking',
 '128557',
 'yo',
 'want',
 'bad',
 'man',
 'good',
 'ya',
 'say',
 'make',
 'look',
 'still',
 'see',
 'hate',
 'back',
 'im',
 'think',
 'need',
 'never',
 'time',
 'gon',
 'faggot',
 'really',
 'people',
 'girl',
 'let',
 'real',
 'right',
 'would',
 'even',
 'lmao',
 'bird',
 'white',
 'said',
 '128553',
 'wan',
 'dick',
 'wit',
 'day',
 'bout',
 'stop',
 'damn',
 'little',
 'ta',
 'tell',
 'call',
 'gt',
 'talk',
 '128175',
 'come',
 '2',
 'da',
 'life',
 'n',
 'cause',
 'dont',
 'new',
 'take',
 'dat',
 'every',
 'charlie',
 'always',
 'money',
 '8217',
 'niggah',
 'better',
 'going',
 'girls',
 'lil',
 '65039',
 'eat',
 'ghetto',
 'ever',
 '128530',
 'give',
 'dumb',
 'retarded',
 'yellow',
 'fuckin',
 'twitter',
 '

In [22]:
def find_features(message):
    """Build the presence/absence feature dict for one message.

    Args:
        message: Text of a single message; it is tokenized with
            ``word_tokenize``.

    Returns:
        dict: one entry per word in the module-level ``word_features``, in
        that order, mapped to True when the word is among the message's
        tokens and False otherwise.
    """
    # Tokenize once and keep the tokens in a set, so each of the many
    # membership tests below is O(1) instead of a scan over a token list.
    tokens = set(word_tokenize(message))
    return {word: (word in tokens) for word in word_features}

In [23]:
# One feature dict per message, in the same order as `message`.
features = [find_features(sentence) for sentence in message]


In [24]:
# For each message, print every feature word that is flagged as present.
for feature_map in features:
    for word, is_present in feature_map.items():
        if not is_present:
            continue
        print(word)

house
woman
complain
niggas
people
faggots
hope
die
woman
dirty
slut
woman
love
day
ate


In [25]:
list(features[0].items())[:10]

[('name', False),
 ('bitch', False),
 ('128514', False),
 ('bitches', False),
 ('co', False),
 ('http', False),
 ('like', False),
 ('hoes', False),
 ('pussy', False),
 ('hoe', False)]

In [26]:
features

[{'name': False,
  'bitch': False,
  '128514': False,
  'bitches': False,
  'co': False,
  'http': False,
  'like': False,
  'hoes': False,
  'pussy': False,
  'hoe': False,
  '8220': False,
  '8221': False,
  'got': False,
  'ass': False,
  'get': False,
  'fuck': False,
  'u': False,
  'shit': False,
  '8230': False,
  'nigga': False,
  'trash': False,
  'lol': False,
  'amp': False,
  'know': False,
  'niggas': False,
  'one': False,
  'love': False,
  'na': False,
  'go': False,
  'fucking': False,
  '128557': False,
  'yo': False,
  'want': False,
  'bad': False,
  'man': False,
  'good': False,
  'ya': False,
  'say': False,
  'make': False,
  'look': False,
  'still': False,
  'see': False,
  'hate': False,
  'back': False,
  'im': False,
  'think': False,
  'need': False,
  'never': False,
  'time': False,
  'gon': False,
  'faggot': False,
  'really': False,
  'people': False,
  'girl': False,
  'let': False,
  'real': False,
  'right': False,
  'would': False,
  'even': False

In [27]:
# message_features = np.array([list(features.values())])

In [28]:
# One row of feature flags per message; column order follows the insertion
# order of each feature dict.
message_features = np.array(
    [list(feature_map.values()) for feature_map in features]
)

In [29]:
message_features

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [30]:
loaded_models

{'k_nearest_neighbors': KNeighborsClassifier(),
 'decision_tree': DecisionTreeClassifier(),
 'random_forest': RandomForestClassifier(),
 'naive_bayes': MultinomialNB(),
 'voting_classifier': <SklearnClassifier(VotingClassifier(estimators=[('K Nearest Neighbors', KNeighborsClassifier()),
                              ('Decision Tree', DecisionTreeClassifier()),
                              ('Random Forest', RandomForestClassifier()),
                              ('Naive Bayes', MultinomialNB())],
                  n_jobs=-1))>}

In [31]:
message[0]

'woman complain cleaning house'

In [32]:
# NOTE(review): this overwrites element 0 of `message` in place using its own
# current value, so re-running the cell is not idempotent. The prediction cell
# below reads `message_features`, not `message` -- confirm this is still needed.
message[0] = [message[0]]

In [33]:
# Show each loaded model and, for all except the voting classifier, its
# predictions for the sample messages.
for name, model in loaded_models.items():
    print(name, ': ', model)
    if name == 'voting_classifier':
        # Skipped here: no prediction is requested from the voting classifier.
        continue
    print(model.predict(message_features))
    
    



k_nearest_neighbors :  KNeighborsClassifier()
[2 2 2 2 2 2 1]
decision_tree :  DecisionTreeClassifier()
[1 2 2 1 1 2 2]
random_forest :  RandomForestClassifier()
[2 2 2 2 2 2 2]
naive_bayes :  MultinomialNB()
[0 1 2 1 1 1 1]
voting_classifier :  <SklearnClassifier(VotingClassifier(estimators=[('K Nearest Neighbors', KNeighborsClassifier()),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Random Forest', RandomForestClassifier()),
                             ('Naive Bayes', MultinomialNB())],
                 n_jobs=-1))>
