# Social Media Analysis : Qantitative Facts on the Twitter Usage of Scientists in Bioinformatics and Medicine

# Import packages and cleaning functions

In [17]:
# Lets import the fundamental packages
import nltk
import random
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
import re
import string

from nltk.corpus import twitter_samples
nltk.download('twitter_samples')

from nltk.corpus import stopwords
nltk.download('stopwords')


# Lets write son function that are going to clean the sentences

# This function is very important because it is going to remove the URL and tweets have a lot
def remove_mentionUrls(text):
    tweet_out = re.sub(r'@[A-Za-z0-9]+', '', text)
    re.sub('https?://[A-Za-z0-9]+', '', tweet_out)
    return tweet_out

# This one removes all the punctuations
def remove_nonalphanumeric(text):
    text_out = ''.join([char for char in text if char not in string.punctuation])
    return text_out

# This funnction removes all the stopwords that are listed in the stopwords_list that we import juste bellow
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words)

# This function removes the grammatical part of all the verbs
def stemming(input_text):
    porter = PorterStemmer()
    words = input_text.split() 
    stemmed_words = [porter.stem(word) for word in words]
    return " ".join(stemmed_words)

# This function removes the words that are not frequent
def to_lower(input_text):
    return input_text.lower()

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/louismockly/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louismockly/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Twitter NLP Naive Bayes

In [19]:
# We first import 5000 positive and 5000 negative tweets from NLTK database
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
pos_tweets_split = [(text.split(), 'pos') for text in pos_tweets]
neg_tweets_split = [(text.split(), 'neg') for text in neg_tweets]
tot_tweets_split = pos_tweets_split + neg_tweets_split
random.shuffle(tot_tweets_split)

# We clean the data and change it into a good format
clean = [to_lower(stemming(remove_stopwords(remove_nonalphanumeric(remove_mentionUrls(text)))))
         for text in pos_tweets + neg_tweets]
words = [text.split() for text in clean]
words = [item for elem in words for item in elem]

# We extract the 2000 most frequent words from the campus
all_words = nltk.FreqDist(w.lower() for w in words)
word_features = list(all_words)[:2000]

# We create the features
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

# NaiveBayse Classifier
featuresets = [(document_features(d), c) for (d,c) in tot_tweets_split]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# We print the score
print(nltk.classify.accuracy(classifier, test_set))

0.67


# 2. Sentiment Analysis with TFiDF and Random Forest 

### We are going to try a second method

In [20]:
# We import TfidfVectorizer and the Random Forest Classifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# We load the Data with tweets, tweets clean and poitive and negative classification
df_data = pd.read_excel('/Users/louismockly/Documents/Twitter_Project/sentiment_analysis.xlsx')
df_data = df_data.drop(['Unnamed: 0'], 1)

# We only keep the data with a minimum len of letters to keep a sentences without noise
df_data = df_data[df_data.tweets_clean.str.len() > 7]
df_data = df_data.reset_index()
df_data = df_data.drop(['index'], 1)

# We apply the TFIDF
vectorizer = TfidfVectorizer()
X = df_data.tweets_clean
response = vectorizer.fit_transform(X)
tab = response.toarray()
df_tfidef = pd.DataFrame(tab)
df_ML = pd.concat([df_data[['class']], df_tfidef], 1)
X, y = df_ML.drop(['class'], 1).values, np.array(df_ML['class'])

# We split the data into train and test 
X_train, X_test, y_train, y_test = X[500:], X[:500], y[500:], y[:500]

# We apply a random forest 
clf = RandomForestClassifier(n_estimators=60)
clf.fit(X_train, y_train)

# We print the score
clf.score(X_test, y_test)

0.766

# 3. Bert Embedding

### We are goind to import the BERT model developped by google. The embedding is very powerfull but very low. Thus we are only goind to code the function that gives sentences embedding if you want to try it.

In [8]:
# We import Bert Model
from keras_bert import get_pretrained, PretrainedList, get_checkpoint_paths
model_path = get_pretrained(PretrainedList.multi_cased_base)
paths = get_checkpoint_paths(model_path)
print(paths.config, paths.checkpoint, paths.vocab)

Using TensorFlow backend.


Downloading data from https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
/Users/louismockly/.keras/datasets/multi_cased_L-12_H-768_A-12/bert_config.json /Users/louismockly/.keras/datasets/multi_cased_L-12_H-768_A-12/bert_model.ckpt /Users/louismockly/.keras/datasets/multi_cased_L-12_H-768_A-12/vocab.txt


In [77]:
from keras_bert import extract_embeddings
# We load the Data with tweets, tweets clean and poitive and negative classification
df_data = pd.read_excel('/Users/louismockly/Documents/Twitter_Project/sentiment_analysis.xlsx')
df_data = df_data.drop(['Unnamed: 0'], 1)

# We only keep the data with a minimum len of letters
df_data = df_data[df_data.tweets_clean.str.len() > 7]
df_data = df_data.reset_index()
df_data = df_data.drop(['index'], 1)

# Here, it gives the embedding of the sentences you want
model_path = '/Users/louismockly/.keras/datasets/multi_cased_L-12_H-768_A-12'
texts = df_data.tweets_clean
embeddings = extract_embeddings(model_path, texts)

# 4. Pro Perso with TFiDF and Random Forest 

### We are going to try the TF-IDF and a Random Forest Classifier to the Pro-Perso data 

In [21]:
# We import the data 
df_pro_perso = pd.read_excel('/Users/louismockly/Documents/Twitter_Project/pro_perso.xlsx')
df_pro_perso = df_pro_perso.drop(['Unnamed: 0'], 1)
df_pro_perso.columns = ['tweet', 'pro_perso']
df_pro_perso = df_pro_perso.dropna()
df_pro_perso = df_pro_perso.reset_index()
df_pro_perso = df_pro_perso.drop(['index'], 1)

# We Clean the data
tweet_pro_perso_clean = [to_lower(stemming(remove_stopwords(remove_nonalphanumeric(remove_mentionUrls(text))))) \
         for text in df_pro_perso.tweet]
df_tweet_clean = pd.DataFrame(tweet_pro_perso_clean, columns = ['tweet_clean'])
df_pro_perso_clean = pd.concat([df_tweet_clean, df_pro_perso], 1)
df_pro_perso_clean = df_pro_perso_clean.replace('Pro', 1)
df_pro_perso_clean = df_pro_perso_clean.replace('Perso', 0)

# We vectorize it
vectorizer = TfidfVectorizer()
X = df_pro_perso_clean.tweet_clean
response = vectorizer.fit_transform(X)
tab = response.toarray()
df_tfidef = pd.DataFrame(tab)
df_ML = pd.concat([df_pro_perso_clean[['pro_perso']], df_tfidef], 1)
X, y = df_ML.drop(['pro_perso'], 1).values, np.array(df_ML['pro_perso'])

# We split intot train and test set
X_train, X_test, y_train, y_test = X[50:], X[:50], y[50:], y[:50]

# Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

clf.score(X_test, y_test)

0.54

# 5. CNN the Best 

## 5.1 Sentiment Analysis

In [23]:
# We load the Data with tweets, tweets clean and poitive and negative classification
df_data = pd.read_excel('/Users/louismockly/Documents/Twitter_Project/sentiment_analysis.xlsx')
df_data = df_data.drop(['Unnamed: 0'], 1)

# We only keep the data with a minimum len of letters
df_data = df_data[df_data.tweets_clean.str.len() > 7]
df_data = df_data.reset_index()
df_data = df_data.drop(['index'], 1)

df_data = df_data.drop(['tweets_clean'], 1)
from sklearn.model_selection import train_test_split

SEED = 2000

# We split the data
x_train, x_validation, y_train, y_validation = \
                train_test_split(df_data.tweets, df_data['class'], test_size=.15, random_state=SEED)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

length = []
for x in x_train:
    length.append(len(x.split()))
max(length)

x_train_seq = pad_sequences(sequences, maxlen=max(length) + 5)

sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=max(length) + 5)


#CNN
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from time import time

acc = []
times = []

model_cnn = Sequential()

# Embedding and import the model
e = Embedding(100000, 100, input_length=max(length) + 5)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# We print the score and accuracy of the model at each step
t0 = time()
model_cnn.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
score,accu = model_cnn.evaluate(x_val_seq, y_validation, verbose = 2, batch_size = 32)
tv_time = time()-t0

acc.append(accu*100)
times.append(tv_time*0.0166667)

print("score: %.2f" % (score))
print("acc: %.2f" % (accu))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 7893 samples, validate on 1393 samples
Epoch 1/5
 - 51s - loss: 0.5156 - accuracy: 0.7322 - val_loss: 0.4600 - val_accuracy: 0.7861
Epoch 2/5
 - 46s - loss: 0.2244 - accuracy: 0.9117 - val_loss: 0.4902 - val_accuracy: 0.7918
Epoch 3/5
 - 44s - loss: 0.0427 - accuracy: 0.9867 - val_loss: 0.6879 - val_accuracy: 0.7782
Epoch 4/5
 - 44s - loss: 0.0074 - accuracy: 0.9981 - val_loss: 0.7929 - val_accuracy: 0.7846
Epoch 5/5
 - 45s - loss: 0.0015 - accuracy: 0.9999 - val_loss: 0.9253 - val_accuracy: 0.7803
score: 0.93
acc: 0.78


## 5.2 Pro VS Perso

In [87]:
# We import the data
df_pro_perso = pd.read_excel('/Users/louismockly/Documents/Twitter_Project/pro_perso.xlsx')
df_pro_perso = df_pro_perso.drop(['Unnamed: 0'], 1)
df_pro_perso.columns = ['tweet', 'pro_perso']
df_pro_perso = df_pro_perso.dropna()
df_pro_perso = df_pro_perso.reset_index()
df_pro_perso = df_pro_perso.drop(['index'], 1)
df_pro_perso = df_pro_perso_clean.replace('Pro', 1)
df_pro_perso = df_pro_perso_clean.replace('Perso', 0)

df_data = df_pro_perso
from sklearn.model_selection import train_test_split

SEED = 2000

# We split the data
x_train, x_validation, y_train, y_validation = \
                train_test_split(df_data.tweet, df_data['pro_perso'], test_size=.10, random_state=SEED)


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

length = []
for x in x_train:
    length.append(len(x.split()))
max(length)

x_train_seq = pad_sequences(sequences, maxlen=max(length) + 5)

sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=max(length) + 5)


#CNN
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from time import time

acc = []
times = []

model_cnn = Sequential()

# Embedding and import the model
e = Embedding(100000, 100, input_length=max(length) + 5)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# We print the score and accuracy of the model at each step
t0 = time()
model_cnn.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
score,accu = model_cnn.evaluate(x_val_seq, y_validation, verbose = 2, batch_size = 32)
tv_time = time()-t0

acc.append(accu*100)
times.append(tv_time*0.0166667)

print("score: %.2f" % (score))
print("acc: %.2f" % (accu))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 433 samples, validate on 49 samples
Epoch 1/5
 - 3s - loss: 0.6707 - accuracy: 0.5912 - val_loss: 0.6738 - val_accuracy: 0.5510
Epoch 2/5
 - 2s - loss: 0.6247 - accuracy: 0.5958 - val_loss: 0.6412 - val_accuracy: 0.5510
Epoch 3/5
 - 2s - loss: 0.5267 - accuracy: 0.6859 - val_loss: 0.5572 - val_accuracy: 0.6735
Epoch 4/5
 - 2s - loss: 0.3240 - accuracy: 0.9376 - val_loss: 0.4264 - val_accuracy: 0.8571
Epoch 5/5
 - 2s - loss: 0.1116 - accuracy: 0.9954 - val_loss: 0.3802 - val_accuracy: 0.8980
score: 0.38
acc: 0.90
