# Predicción de insultos en Twitter

In [1]:
import re, string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

## Cargar Los Datos Crudos

In [2]:
def clean_text(text):
    """Reduce a comment to its lowercase a-z words, separated by single spaces.

    The input is lowercased and scanned for runs of the letters a-z that have
    a word boundary on both sides. Runs that touch a digit or underscore are
    therefore not matched at all (e.g. ``abc123`` contributes nothing), and
    punctuation acts as a separator (``don't`` becomes ``don t``).

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The matched words joined by single spaces; empty if nothing matched.
    """
    words = re.findall(r'\b[a-z]+\b', text.lower())
    return ' '.join(words)

In [3]:
# Load the labelled comments from train.csv (path is relative to the
# notebook's working directory) and preview the first rows.
training_data = pd.read_csv('train.csv')
training_data.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [4]:
# Store a normalized copy of every comment (see clean_text) in a new column.
# NOTE(review): clean_text calls .lower() on each value, so a missing (NaN)
# comment would raise here -- confirm the Comment column has no nulls.
training_data['cleaned_comment'] = training_data['Comment'].map(clean_text)

In [5]:
# Preview the frame again, now including the cleaned_comment column.
training_data.head()

Unnamed: 0,Insult,Date,Comment,cleaned_comment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",i really don t understand your point it seems ...
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ...",a of canadians can and has been wrong before n...
3,0,,"""listen if you dont wanna get married to a man...",listen if you dont wanna get married to a man ...
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",c b xu bi t xecnh c ho kh nc ng d ng cu xed ch...


In [6]:
# First rows whose label marks them as insults (Insult == 1).
training_data[training_data['Insult'] == 1].head()

Unnamed: 0,Insult,Date,Comment,cleaned_comment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
7,1,,"""shut the fuck up. you and the rest of your fa...",shut the fuck up you and the rest of your fagg...
8,1,20120502173553Z,"""Either you are fake or extremely stupid...may...",either you are fake or extremely stupid maybe ...
9,1,20120620160512Z,"""That you are an idiot who understands neither...",that you are an idiot who understands neither ...
15,1,20120611090207Z,"""FOR SOME REASON U SOUND RETARDED. LOL. DAMN. ...",for some reason u sound retarded lol damn wher...


In [7]:
# (rows, columns) of the training frame.
training_data.shape

(3947, 4)

> Podemos apreciar que, como un insulto está codificado como 1 y la ausencia aparente de insulto como 0, el porcentaje de insultos en los datos se puede obtener directamente con el promedio de la columna `Insult`.

In [8]:
# Share of comments labelled as insults, computed two ways as a sanity check:
# the mean of the Insult column and the explicit ratio insults / total rows.
mean_values = training_data['Insult'].mean()
print('El promedio de los datos es {}'.format(mean_values))

# Count rows labelled 1 with a boolean sum; unlike value_counts()[1] this
# yields 0 instead of raising KeyError when no row is labelled as an insult.
insults = (training_data['Insult'] == 1).sum()
total_tweets = training_data.shape[0]
mean_of_insults = insults / total_tweets
# Report the ratio computed from the counts; the original printed mean_values
# a second time under this label.
print('El promedio de los insultos es {}'.format(mean_of_insults))
insults_percent = mean_of_insults * 100
print('El porcentaje de insultos es del {}%'.format(insults_percent))

El promedio de los datos es 0.2657714720040537
El promedio de los insultos es 0.2657714720040537
El porcentaje de insultos es del 26.57714720040537%


In [9]:
# CountVectorizer?

In [10]:
# Bag-of-words features over the cleaned comments: word 1- to 3-grams with
# English stop words removed and the vocabulary capped by max_features.
count_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    max_features=50000,
)

# Learn the vocabulary and build the document-term matrix in a single call.
X = count_vectorizer.fit_transform(training_data['cleaned_comment'])
y = training_data['Insult']

In [11]:
def split_data(X, y, p=.75, random_state=None):
    """Randomly split features and labels into training and validation sets.

    Every row is assigned to the training set independently with probability
    ``p``; all other rows go to the validation set. The set sizes are
    therefore random as well, not an exact ``p`` / ``1 - p`` partition.

    Parameters
    ----------
    X : array or sparse matrix
        Feature rows; must expose ``shape[0]`` and accept a boolean row mask.
    y : array or Series
        Labels aligned with the rows of ``X``; must accept the same mask.
    p : float, default 0.75
        Probability that a row is placed in the training set.
    random_state : int or None, default None
        Seed for a private random generator, making the split reproducible.
        With ``None`` the global NumPy random state is used, as before.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_validation, y_validation)``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # One vectorized Bernoulli draw per row instead of a Python-level loop.
    mask = rng.binomial(1, p, size=X.shape[0]).astype(bool)

    X_train = X[mask]
    y_train = y[mask]
    X_validation = X[~mask]
    y_validation = y[~mask]

    return X_train, y_train, X_validation, y_validation
    
    

In [12]:
# Random train/validation split with the default training probability of
# split_data (p=.75).
X_train, y_train, X_validation, y_validation = split_data(X, y)

In [13]:
# (rows, features) that ended up in the training split.
X_train.shape

(2951, 50000)

In [14]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with scikit-learn's default hyper-parameters, trained
# on the training split only.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
from sklearn.metrics import accuracy_score

# Predict on the held-out validation split and report the accuracy of the
# baseline model on it.
predictions = model.predict(X_validation)
validation_score = accuracy_score(y_validation, predictions)

print('Validation score:', validation_score)

Validation score: 0.8273092369477911


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [17]:
class PredictionPipeline:
    """Fit and score one vectorizer/model combination on the training data.

    ``run()`` fits a text vectorizer on the ``cleaned_comment`` column, turns
    the comments into features, splits them at random into training and
    validation sets with the module-level ``split_data``, fits the model on
    the training part and stores its validation accuracy (computed with
    ``accuracy_score``) in ``validation_score``.

    Parameters
    ----------
    ngram_range : tuple
        Forwarded to the vectorizer as ``ngram_range``.
    vectorizer_class : class
        Instantiated with ``analyzer``, ``ngram_range``, ``stop_words`` and
        ``max_features`` keyword arguments; the instance must provide
        ``fit`` and ``transform``.
    model_class : class
        Instantiated without arguments; the instance must provide
        ``fit(X, y)`` and ``predict(X)``.
    training_data : DataFrame or mapping
        Must provide the ``'cleaned_comment'`` and ``'Insult'`` columns.
    """

    def __init__(self, ngram_range, vectorizer_class, model_class, training_data):
        self.ngram_range = ngram_range
        self.vectorizer_class = vectorizer_class
        self.model_class = model_class
        self.training_data = training_data

        # Everything below is filled in by run().
        self.vectorizer = None
        self.X = None
        self.y = None
        self.X_train = None
        self.y_train = None
        self.X_validation = None
        self.y_validation = None
        self.model = None
        self.validation_score = None

    def run(self):
        """Execute every pipeline stage in order and print a short summary."""
        self._fit_vectorizer()
        self._featurize_text()
        self._split_train_and_validation_sets()
        self._fit_model_on_training_data()
        self._validate_model_on_validation_set()

        print(
            """
            Vectorizer Class: {vectorizer_class}\n\
            N-gram Range: {ngram_range}\n\
            Model Class: {model_class}\n\
            Validation Score: {validation_score}
            """.format(
                vectorizer_class=repr(self.vectorizer_class.__name__),
                ngram_range=self.ngram_range,
                model_class=repr(self.model_class.__name__),
                validation_score=round(self.validation_score, 4)
            )
        )

    def _fit_vectorizer(self):
        """Build the vectorizer from this instance's settings and fit it."""
        # Use the values stored on the instance. The bare names
        # `vectorizer_class` / `ngram_range` only resolved when globals with
        # those exact names happened to exist in the notebook.
        self.vectorizer = self.vectorizer_class(
            analyzer='word',
            ngram_range=self.ngram_range,
            stop_words='english',
            max_features=50000,
        )
        self.vectorizer.fit(self.training_data['cleaned_comment'])

    def _featurize_text(self):
        """Transform the cleaned comments into features and pick the labels."""
        self.X = self.vectorizer.transform(self.training_data['cleaned_comment'])
        self.y = self.training_data['Insult']

    def _split_train_and_validation_sets(self):
        """Split features and labels at random with ``split_data``."""
        self.X_train, self.y_train, self.X_validation, self.y_validation = split_data(
            self.X, self.y)

    def _fit_model_on_training_data(self):
        """Instantiate the model with its defaults and fit it on the training set."""
        self.model = self.model_class()
        self.model.fit(self.X_train, self.y_train)

    def _validate_model_on_validation_set(self):
        """Store the model's accuracy on the validation set."""
        predictions = self.model.predict(self.X_validation)
        self.validation_score = accuracy_score(self.y_validation, predictions)

In [18]:
# Grid search over n-gram range x vectorizer x model.
# `results` maps str(validation score) -> the hyper-parameters that produced
# it. Because the score string is the dictionary key, two configurations that
# reach exactly the same score overwrite each other.
results = {}

for ngram_range in [(1, 1), (1, 2), (1, 3), (1, 4)]:
    for vectorizer_class in [CountVectorizer, TfidfVectorizer]:
        for model_class in [LogisticRegression, LinearSVC, RandomForestClassifier]:

            # Build a pipeline for this combination. run() fits it, prints a
            # summary and draws its own random train/validation split, so the
            # scores of different combinations come from different splits.
            prediction_pipeline = PredictionPipeline(
                ngram_range=ngram_range,
                vectorizer_class=vectorizer_class,
                model_class=model_class,
                training_data=training_data
            )

            prediction_pipeline.run()

            # Record the hyper-parameters under the stringified score.
            results[str(prediction_pipeline.validation_score)] = {
                    'vectorizer_class': prediction_pipeline.vectorizer_class,
                    'ngram_range': prediction_pipeline.ngram_range,
                    'model_class': prediction_pipeline.model_class
            }


            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LogisticRegression'
            Validation Score: 0.8258
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LinearSVC'
            Validation Score: 0.7976
            

            Vectorizer Class: 'CountVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'RandomForestClassifier'
            Validation Score: 0.826
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LogisticRegression'
            Validation Score: 0.7842
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'LinearSVC'
            Validation Score: 0.8249
            

            Vectorizer Class: 'TfidfVectorizer'
            N-gram Range: (1, 1)
            Model Class: 'RandomForestClass

In [19]:
# The keys of `results` are scores stored as strings. Rank them by numeric
# value: a plain string sort is lexicographic and can misorder scores, e.g.
# '9e-05' would sort above '0.8'.
top_3_scores = sorted(results.keys(), key=float, reverse=True)[:3]

for score in top_3_scores:
    print('Score: {score}\nParameters: {parameters}\n'.format(
        score=score, parameters=results[score]))

Score: 0.8263157894736842
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'model_class': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'ngram_range': (1, 2)}

Score: 0.8259958071278826
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'model_class': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'ngram_range': (1, 1)}

Score: 0.8258196721311475
Parameters: {'vectorizer_class': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'model_class': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'ngram_range': (1, 1)}



In [20]:
# Key (the score as a string) of the best-ranked entry in `results`.
top_score_key = top_3_scores[0]

In [21]:
# Unpack the hyper-parameters of the best-scoring configuration.
vectorizer_class, ngram_range, model_class = (
    results[top_score_key][field]
    for field in ('vectorizer_class', 'ngram_range', 'model_class')
)

# Vectorizer for the final model: learn the vocabulary and build the features
# from the complete data set in one call.
vectorizer = vectorizer_class(
    analyzer='word',
    ngram_range=ngram_range,
    stop_words='english',
    max_features=50000,
)
X = vectorizer.fit_transform(training_data['cleaned_comment'])
y = training_data['Insult']

# Final model, fitted on all rows (no validation hold-out this time).
model = model_class()
model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# Interactive demo: classify sentences typed by the user with the final model.
# An empty line, end-of-input or a kernel interrupt ends the session; the
# original loop had no exit path and kept the cell running forever.
print('Enter an empty line to quit.')
while True:
    try:
        input_string = input('Please enter a string: ')
    except (EOFError, KeyboardInterrupt):
        break
    if not input_string.strip():
        break

    # Apply the same normalization that was used on the training comments.
    input_string = clean_text(input_string)
    x_test = vectorizer.transform([input_string])

    prediction = model.predict(x_test)[0]
    print('Insult?: {}'.format(bool(prediction)))