<h1>Proyecto 1</h1>

In [3]:
import pandas as pd
import nltk
import contractions
import inflect
import unicodedata, re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score as AS
from sklearn.metrics import f1_score as F1
from sklearn.metrics import precision_score as PS
from sklearn.metrics import recall_score as RS
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, make_scorer
from sklearn.ensemble import RandomForestClassifier

Antes de ejecutar este notebook, instale las dependencias adicionales con los siguientes comandos:
- pip install contractions 
- pip install inflect


<h1>1. Entendimiento y preparación de datos</h1>

In [4]:
# Load the dataset from a path relative to the notebook; the first CSV column
# becomes the DataFrame index.
ruta = "./data/SuicidiosProyecto.csv"
df = pd.read_csv(ruta, encoding="utf-8", index_col=0)
# Display the column names as the cell output.
df.columns

Index(['text', 'class'], dtype='object')

In [5]:
# Show five randomly chosen rows to get a feel for the raw posts and labels.
df.sample(5)

Unnamed: 0_level_0,text,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
319701,Having a girlfriend in a dream seems better th...,non-suicide
207064,I'd rather die than live like this.I really wo...,suicide
212566,Point me to the right direction?Saw some info ...,suicide
197805,The downvote button should only be used if a c...,non-suicide
64754,"Anyone else feeling just sad? I just feel sad,...",non-suicide


In [6]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

text     object
class    object
dtype: object

In [7]:
# Class balance as proportions (normalize=True); dropna=False also reports missing labels.
df['class'].value_counts(dropna=False, normalize=True)

non-suicide    0.562928
suicide        0.437072
Name: class, dtype: float64

In [19]:
def remove_non_ascii(word):
    """Strip every non-ASCII character from a token.

    The token is NFKD-normalised first, so characters with a compatibility
    decomposition (e.g. accented letters) keep their ASCII base character;
    anything that still cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', word)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def to_lowercase(word):
    """Return the token with all of its characters lower-cased."""
    return word.lower()

def remove_punctuation(word):
    """Delete every character that is neither a word character nor whitespace."""
    # \w keeps letters, digits and the underscore; \s keeps whitespace.
    return re.sub(r'[^\w\s]', '', word)

def replace_numbers(word):
    """Replace an all-digit token with its textual representation.

    Tokens for which ``str.isdigit()`` is false are returned unchanged.
    All-digit tokens are converted with ``inflect``'s ``number_to_words``.

    The inflect engine is created only when a numeric token is first seen and
    is then reused on later calls, instead of being rebuilt for every token.
    """
    if not word.isdigit():
        return word
    # Cache the engine on the function object so it is built at most once.
    engine = getattr(replace_numbers, "_engine", None)
    if engine is None:
        engine = inflect.engine()
        replace_numbers._engine = engine
    return engine.number_to_words(word)

def remove_contractions(word):
    """Expand contractions in a token by delegating to ``contractions.fix``."""
    return contractions.fix(word)

def preprocessing(words, stop_words=None):
    """Clean a list of tokens and return the tokens that survive.

    Each token is stripped of non-ASCII characters, lower-cased and stripped
    of punctuation. Tokens that end up empty or that are stop words are
    discarded; contractions in the remaining tokens are expanded.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, the NLTK English stop-word list is
        loaded once and cached on the function for later calls.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    if stop_words is None:
        stop_words = getattr(preprocessing, "_stop_words", None)
        if stop_words is None:
            # Import under a private alias: a later cell rebinds the global
            # name `stopwords` to a plain set, which would break a lookup
            # through that name when this function is re-run.
            from nltk.corpus import stopwords as nltk_stopwords
            stop_words = frozenset(nltk_stopwords.words('english'))
            preprocessing._stop_words = stop_words

    new_words = []
    for word in words:
        new_word = remove_punctuation(to_lowercase(remove_non_ascii(word)))
        if new_word == '' or new_word in stop_words:
            continue
        # NOTE(review): number replacement (replace_numbers) was disabled in
        # the original pipeline and is left disabled here.
        new_words.append(remove_contractions(new_word))
    return new_words

In [21]:
# word_tokenize and the stop-word filter used by preprocessing need these NLTK
# resources. They are fetched here because this cell runs before any other
# download in the notebook; nltk.download skips resources already up to date.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' - confirm.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Tokenize every post and clean its tokens (noise removal).
df['words'] = df['text'].apply(nltk.word_tokenize).apply(preprocessing)
df.head()

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split uses the raw `text` column, not the cleaned `words`
# column computed earlier - confirm that this is intended.
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.3, random_state=28)

In [None]:
# Side-by-side pie charts of the label distribution in each partition.
fig, ax = plt.subplots(1, 2, figsize=(12, 12))
for axis, title, labels in zip(ax, ('Train', 'Test'), (y_train, y_test)):
    counts = labels.value_counts()
    axis.pie(counts, labels=counts.index, autopct='%1.1f%%')
    axis.set_title(title)
plt.show()

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Reach the corpus through the nltk package rather than the imported name:
# this cell rebinds `stopwords` to a set, so on a re-run the name no longer
# refers to the corpus reader and `stopwords.words` would not exist.
stopwords = set(nltk.corpus.stopwords.words('english'))

# set.union returns a new set and leaves the original untouched, so the
# dataset-specific noise tokens have to be merged in place to take effect.
stopwords |= {'4661', 'meeeeeeeeeeee', 'ja', '01457654035', 'reaally', '3624', '3904512441', 'mesooo', 'ان فرع', ''}

In [None]:
def tokenize(text):
    """Split ``text`` into tokens using a freshly created NLTK ``TweetTokenizer``."""
    return nltk.TweetTokenizer().tokenize(text)

In [None]:
# Bag-of-words representation: fit the vocabulary on the training texts only,
# then reuse it to encode the test texts.
# stop_words is passed as a list because recent scikit-learn releases validate
# this parameter and reject a set.
bow = CountVectorizer(tokenizer=tokenize, stop_words=sorted(stopwords))
x_train_bow = bow.fit_transform(x_train)
x_test_bow = bow.transform(x_test)
# Vocabulary size. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
len(bow.get_feature_names_out())

In [None]:
#tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords)
#x_train_tfidf = tfidf.fit_transform(x_train)
#x_test_tfidf = tfidf.transform(x_test)
#len(tfidf.get_feature_names())

<h3>Random forest usando BoW</h3>

In [None]:
# Small random forest (12 trees, depth capped at 10) with a fixed seed.
bow_model = RandomForestClassifier(
    n_estimators=12,
    max_depth=10,
    random_state=2,
)

In [None]:
# Train the forest on the bag-of-words matrix built from the training texts.
bow_model.fit(x_train_bow, y_train)

In [None]:
# Feature importance in tree-based models indicates which features matter most
# when the model makes a decision.
# The labels must follow the column order of the BoW matrix. `bow.vocabulary_`
# is a term -> column-index dict whose iteration order is not the column
# order, so using it as the index pairs importances with the wrong terms.
feature_names = bow.get_feature_names_out()
bow_importances = pd.Series(bow_model.feature_importances_, index=feature_names)
bow_importances.sort_values().tail(40).plot.barh(figsize=(15, 10))

In [None]:
# Summarise the fitted forest: number of trees and their average depth.
bow_estimators = bow_model.estimators_
tree_depths = [tree.get_depth() for tree in bow_estimators]
print('Number of trees:', len(bow_estimators))
print('Trees depth (mean):', np.mean(tree_depths))

In [None]:
# Predict on both partitions. The test texts were already vectorized into
# x_test_bow, so that matrix is reused instead of transforming x_test again.
y_train_bow_predict = bow_model.predict(x_train_bow)
y_test_bow_predict = bow_model.predict(x_test_bow)

In [None]:
# Confusion matrix on the training partition (true labels vs. predictions).
ConfusionMatrixDisplay.from_predictions(y_train, y_train_bow_predict)

In [None]:
# Confusion matrix on the held-out test partition (true labels vs. predictions).
ConfusionMatrixDisplay.from_predictions(y_test, y_test_bow_predict)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score
import pickle

In [None]:
# Training-set metrics. With two classes, 'suicide' is scored as the positive
# class; otherwise it is common to compute the metrics for every class.
if len(bow_model.classes_) == 2:
    score_kwargs = {'pos_label': 'suicide'}
else:
    score_kwargs = {'average': None}

for label, scorer in (('Precision:', precision_score),
                      ('Recall:', recall_score),
                      ('F1:', f1_score)):
    print(label, scorer(y_train, y_train_bow_predict, **score_kwargs))

In [None]:
# Test-set metrics. With two classes, 'suicide' is scored as the positive
# class; otherwise it is common to compute the metrics for every class.
if len(bow_model.classes_) == 2:
    score_kwargs = {'pos_label': 'suicide'}
else:
    score_kwargs = {'average': None}

for label, scorer in (('Precision:', precision_score),
                      ('Recall:', recall_score),
                      ('F1:', f1_score)):
    print(label, scorer(y_test, y_test_bow_predict, **score_kwargs))

In [None]:
from datetime import datetime

# Capture one timestamp and derive both parts from it, so the date and the
# time used in the model file name are consistent with each other.
now = datetime.now()
date, time = now.date(), now.strftime("%H-%M-%S")

In [None]:
# Persist the trained model under a timestamped file name.
nombre = "modelo_dia_" + str(date) + "_hora_" + str(time) + ".pkl"
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(nombre, "wb") as model_file:
    pickle.dump(bow_model, model_file)

In [None]:
#f=open("primermodelo.pkl",'rb')
#arbolito = pickle.load(f)
#print(arbolito)