# **Jose Ángel Pertuz Montes**
# **Miguel Ángel Banda Del Valle**

In [None]:
%matplotlib inline

import io
import sys
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import TweetTokenizer
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression


# Import dataset

In [None]:
# Load the dataset; the CSV uses ';' as its field separator instead of ','.
raw = pd.read_csv('../input/twitter/TASS2018.csv', sep=';')
# Bare expression: the notebook renders the loaded frame as this cell's output.
raw

# Dataset description

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes) to spot missing values early.
raw.info()

# Prediction class

In [None]:
# Count the tweets ('content' entries) that fall under each polarity label,
# keeping 'polarity' as a regular column rather than the index.
tweet_by_polarity = (
    raw
    .groupby("polarity", as_index=False)["content"]
    .count()
)
# Show at most the first four label/count rows.
tweet_by_polarity.head(4)

# **Gráfica**

In [None]:
# Bar chart with one bar per polarity label; bar height is the number of tweets.
plt.figure(figsize=(9,5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
# (0.11 only emitted a deprecation warning), so the positional call fails there.
sns.barplot(data=tweet_by_polarity, x='polarity', y='content', alpha=1)
plt.title('Polary Frequency by content')
plt.ylabel('content', fontsize=10)
plt.xlabel('polarity', fontsize=10)
plt.show()

In [None]:
def features_lexical(text):
    """Tokenize ``text`` with a fresh ``TweetTokenizer`` and return the resulting tokens."""
    return TweetTokenizer().tokenize(text)

# Features
- Número de caracteres por tuit
- Promedio ponderado de caracteres por tuit  (#caracteres/240)
- Número de menciones
- Número de emojis
- Número de palabras
- Número de caracteres especiales

In [None]:
# Per-tweet feature lists; position i in every list describes row i of raw['content'].
twchar = []   # number of characters
twprom = []   # number of characters divided by 240
twword = []   # number of tokens produced by features_lexical
twats = []    # number of '@' characters
twemoji = []  # number of characters counted as emojis
twesp = []    # number of remaining special characters

for tweet in raw['content']:
    n_chars = len(tweet)

    # Anything that is not a word character, whitespace, '@' or one of the
    # punctuation marks listed in the class is counted as an emoji.
    # NOTE(review): inside the class, `+-^` is parsed as a character range
    # rather than three literal characters - confirm that this is intended.
    emoji_hits = re.findall(r'[^\w\s\@/|°!"#$%&)(=?¿¡¨*}{<>",.:+-^\']', tweet)
    # Every character that is not a word character, whitespace or '@'.
    symbol_hits = re.findall(r'[^\w\s\@]', tweet)

    twchar.append(n_chars)
    twprom.append(n_chars / 240)
    # Every '@' is counted, even when it does not actually mention a user.
    twats.append(tweet.count("@"))
    twemoji.append(len(emoji_hits))
    twword.append(len(features_lexical(tweet)))
    # Special characters are the symbol matches that were not counted as emojis.
    twesp.append(len(symbol_hits) - len(emoji_hits))

# **Adding new features to Dataframe**

In [None]:
# Column name -> per-tweet values, in the order the columns are appended to the frame.
feature_columns = {
    'N° chars': twchar,
    '# char/240': twprom,
    '# words': twword,
    '# mentions': twats,
    '# emojis': twemoji,
    '# esp char': twesp,
}
for column_name, column_values in feature_columns.items():
    raw[column_name] = column_values
# Bare expression: display the frame with the new feature columns.
raw

# **Encoding polarity labels as numeric values**

In [None]:
# Integer code assigned to each polarity label.
polarity_codes = {'NONE': 0, 'NEU': 2, 'N': 1, 'P': 3}

# Encode every row's label; a label that is not in the table contributes no
# entry, so the result can be shorter than raw['polarity'] in that case.
polarity_norm = [
    polarity_codes[label]
    for label in raw['polarity']
    if label in polarity_codes
]

# Define data (train/test) & train the model 

# For #Chars

In [None]:
# Number of trailing tweets held out for testing; everything before them is used for training.
n_test = 101

# Response (y): number of characters per tweet.
y_train = twchar[:-n_test]
y_test = twchar[-n_test:]

# Predictor (X): numeric polarity code of each tweet.
X_train = polarity_norm[:-n_test]
X_test = polarity_norm[-n_test:]

# The estimator needs a 2-D (n_samples, n_features) input. Using -1 lets NumPy
# infer the number of rows, so the cell no longer depends on the dataset
# having exactly 405 + 101 tweets.
X_train = np.reshape(X_train, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

In [None]:
# Make predictions using the testing set: the fitted regressor is evaluated on the held-out X_test rows
y_pred = regr.predict(X_test)

In [None]:
# Score the held-out predictions before printing the report.
mse = mean_squared_error(y_test, y_pred)
# Coefficient of determination: 1 is perfect prediction.
r2 = r2_score(y_test, y_pred)

# Fitted coefficients of the linear model.
print('Coefficients: \n', regr.coef_)
print('Mean squared error: %.2f' % mse)
print('Coefficient of determination: %.2f' % r2)

In [None]:
# Held-out tweets as black points with the fitted regression line drawn on top.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='black')
ax.plot(X_test, y_pred, color='blue', linewidth=3)
# One tick per polarity code on x; fixed ticks from 10 to 150 in steps of 20 on y.
ax.set_xticks(range(0, 4, 1))
ax.set_yticks(range(10, 160, 20))
ax.set_title('Linear Regression Results')
ax.set_ylabel('# Chars', fontsize=10)
ax.set_xlabel('polarity', fontsize=10)
plt.show()

# For #Words

In [None]:
# Number of trailing tweets held out for testing; everything before them is used for training.
n_test = 101

# Response (y): number of tokens (words) per tweet.
y_train = twword[:-n_test]
y_test = twword[-n_test:]

# Predictor (X): numeric polarity code of each tweet.
X_train = polarity_norm[:-n_test]
X_test = polarity_norm[-n_test:]

# The estimator needs a 2-D (n_samples, n_features) input. Using -1 lets NumPy
# infer the number of rows, so the cell no longer depends on the dataset
# having exactly 405 + 101 tweets.
X_train = np.reshape(X_train, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

In [None]:
# Evaluate the fitted regressor on the held-out rows.
y_pred = regr.predict(X_test)

# Held-out tweets as black points with the fitted regression line drawn on top.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='black')
ax.plot(X_test, y_pred, color='blue', linewidth=3)
# One tick per polarity code on x; fixed ticks from 0 to 30 in steps of 5 on y.
ax.set_xticks(range(0, 4, 1))
ax.set_yticks(range(0, 35, 5))
ax.set_title('Linear Regression Results')
ax.set_ylabel('# Words', fontsize=10)
ax.set_xlabel('polarity', fontsize=10)
plt.show()

In [None]:
# Number of trailing tweets held out for testing; everything before them is used for training.
n_test = 101

# Feature (X): number of tokens (words) per tweet.
X_train = twword[:-n_test]
X_test = twword[-n_test:]

# Binary target (y): 1 when the tweet's polarity code is 2, else 0.
# Built from the polarity_norm list because the frame has no 'polarity_norm'
# column (indexing raw with it raises KeyError). The builtin int replaces the
# np.int alias, which was removed from NumPy.
y = (np.asarray(polarity_norm) == 2).astype(int)
y_train = y[:-n_test]
y_test = y[-n_test:]

# The estimator needs a 2-D (n_samples, n_features) input. Using -1 lets NumPy
# infer the number of rows instead of hard-coding 405 and 101.
X_train = np.reshape(X_train, (-1, 1))
X_test = np.reshape(X_test, (-1, 1))

# Create logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
# Predicted class probabilities of the fitted logistic model for the held-out rows.
y_prob = log_reg.predict_proba(X_test)

In [None]:
# Predicted probabilities of the logistic model as a function of word count.
# The positive class is "polarity code == 2", which is the code given to the
# 'NEU' label; the previous "Iris virginica" labels came from an unrelated example.
word_counts = np.ravel(X_test)
# Sort by word count so each curve is drawn once, left to right, instead of
# zig-zagging through the rows in dataset order.
order = np.argsort(word_counts)
plt.plot(word_counts[order], y_prob[order, 1], "g-", label="NEU")
plt.plot(word_counts[order], y_prob[order, 0], "b--", label="Not NEU")
plt.xlabel('# Words', fontsize=10)
plt.ylabel('Probability', fontsize=10)
# Without a legend the labels passed to plot() are never shown.
plt.legend()
plt.show()