In [1]:
# imports 

import json
import os
import re
import sqlite3
import emoji

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sqlite3 import Error
from datetime import datetime
from datasets import Dataset
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers.pipelines.pt_utils import KeyDataset

tqdm.pandas()

# paths 
db_name = 'data/dbl.db'

# Connection to database

In [4]:
def create_connection(db_file):
    """ create a database connection to a SQLite database """
    conn = None
    try:
        
        conn = sqlite3.connect(db_file)
        print(sqlite3.version)
    except Error as e:
        print(e)
    return conn

In [5]:
try:
     
    # Making a connection between sqlite3
    # database and Python Program
    sqliteConnection = sqlite3.connect(db_name)
     
    # If sqlite3 makes a connection with python
    # program then it will print "Connected to SQLite"
    # Otherwise it will show errors
    print("Connected to SQLite")
 
    # Getting all tables from sqlite_master
    sql_query = """SELECT name FROM sqlite_master
    WHERE type='table';"""
 
    # Creating cursor object using connection object
    cursor = sqliteConnection.cursor()
     
    # executing our sql query
    cursor.execute(sql_query)
    print("List of tables\n")
     
    # printing all tables list
    print(cursor.fetchall())
 
except sqlite3.Error as error:
    print("Failed to execute the above query", error)
     
finally:
   
    # Inside Finally Block, If connection is
    # open, we need to close it
    if sqliteConnection:
         
        # using close() method, we will close
        # the connection
        sqliteConnection.close()
         
        # After closing connection object, we
        # will print "the sqlite connection is
        # closed"
        print("the sqlite connection is closed")

Connected to SQLite
List of tables

[('users',), ('tweets',), ('tweets_geo',), ('replies',), ('retweets',), ('quotes',), ('hashtags',), ('sqlite_sequence',), ('symbols',), ('user_mentions',)]
the sqlite connection is closed


# Classifing companies

In [6]:
avia_companies = {'KLM': 56377143, 'AirFrance': 106062176, 'British_Airways': 18332190, 'AmericanAir': 22536055,
                  'Lufthansa': 124476322, 'AirBerlin': 26223583,
                  'AirBerlin assist': 2182373406, 'easyJet': 38676903, 'RyanAir': 1542862735, 'SingaporeAir': 253340062,
                  'Qantas': 218730857, 'EtihadAirways': 45621423,
                  'VirginAtlantic': 20626359}

# British Airways, EtihadAirways, VirginAtlantic are private companies, so no tickers of those are available
# AirFrance and KLM had a merger at 2004 same with AirBerlin and AirBerlin assist
# AirBerlin assist is not listed
# Write a regular expression for Air Berlin, RyanAir because they have matching pattern


stock = {56377143: ['AF.PA', 'AFLYY', 'AFRAF', 'AFR.F', 'AIRF-U.TI'],  # KLM
         106062176: ['AF.PA', 'AFLYY', 'AFRAF', 'AFR.F', 'AIRF-U.TI'],  # AirFrance
         22536055: ['AAL', 'A1G.DU', 'AAL.MX', 'AAL.BA'],  # AmericanAir
         124476322: ['LHA.DE', 'DLAKY', 'LHA.F', 'DLAKF', 'LHA.SG'],  # Lufthansa
         26223583: ['AB1.F', 'AB1.HA', 'AB1.MU', 'AB1.DU', 'AB1.BE', 'AB1.HM'],  # AirBerlin
         38676903: ['EZJ.L', 'ESYJY', 'EJT1.DE', 'EJTTF', 'EJT1.HA', 'EJTS.F'],  # EasyJet
         1542862735: ['RYAAY', 'RYA.IR', 'RY4C.F', 'RY4C.DE', 'RY4C.BE', 'RY4C.DU'],  # RyanAir
         253340062: ['C6L.SI', 'SINGY', 'SINGF', 'SIA1.MU', 'SIA.MU', 'SIA1.HA']}  # SingaporeAir

In [7]:
cnx = create_connection(db_name)

2.6.0


In [8]:
# create dataframes from the database 

users = pd.read_sql_query("SELECT * FROM users", cnx)
tweets = pd.read_sql_query("SELECT * FROM tweets", cnx)
tweets_geo = pd.read_sql_query("SELECT * FROM tweets_geo", cnx)
replies = pd.read_sql_query("SELECT * FROM replies", cnx)
retweets = pd.read_sql_query("SELECT * FROM retweets", cnx)
quotes = pd.read_sql_query("SELECT * FROM quotes", cnx)
hashtags = pd.read_sql_query("SELECT * FROM hashtags", cnx)
sqlite_sequence = pd.read_sql_query("SELECT * FROM sqlite_sequence", cnx)
symbols = pd.read_sql_query("SELECT * FROM symbols", cnx)
user_mentions = pd.read_sql_query("SELECT * FROM user_mentions", cnx)

In [11]:
# convert the timestamp in datetime format and then to the string 

tweets['created_at'] = tweets['timestamp_ms'].progress_apply(lambda x: datetime.fromtimestamp(int(x/1000)))                           

100%|██████████| 6094135/6094135 [00:16<00:00, 364293.65it/s]


In [12]:
# add the hour month day and year columns 

tweets['year'] = tweets.created_at.dt.year
tweets['month'] = tweets.created_at.dt.month
tweets['day'] = tweets.created_at.dt.day
tweets['hour'] = tweets.created_at.dt.hour

# Text Cleaning

In [14]:
# get text lenghts = word count in the tweet

text_len = []
for text in tqdm(tweets.text):
    tweet_len = len(text.split())
    text_len.append(tweet_len)
tweets['text_len'] = text_len

100%|██████████| 6094135/6094135 [00:15<00:00, 390480.32it/s]


In [15]:
# define the functions for cleaning

def demojize_emoji(text):
    return emoji.demojize(text)

def remove_entities(text):
    # Getting rid off \n and \r
    text = text.replace('\r', r'').replace('\n', r' ')

    text = re.sub(r"(?:(https?|www)\://)\S+|#|:|\$|@", r" ", text)

    text = re.sub(r'[^\x00-\x7f]', r'', text)
    
    return text

def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): ## remove multiple spaces
    return re.sub("\s\s+" , " ", text)

def remove_spam(text):
    match = re.search(r'subscribe', text)
    if match:
        return ''
    else:
        return text

In [16]:
# constructing the list with clean text 

text_new = []
for t in tqdm(tweets.text):
    text_new.append(remove_spam(remove_mult_spaces(filter_chars(remove_entities(demojize_emoji(t))))))
    
tweets['text_clean'] = text_new
tweets['text_clean'] = tweets['text_clean'].str.lower()

100%|██████████| 6094135/6094135 [08:37<00:00, 11774.34it/s]


In [17]:
# construction the list with text lenghts 

text_len = []
for text in tqdm(tweets.text_clean):
    tweet_len = len(text.split())
    text_len.append(tweet_len)
tweets['text_clean_len'] = text_len

100%|██████████| 6094135/6094135 [00:13<00:00, 438712.44it/s]


In [32]:
tweets = tweets.reset_index(drop=True)

Unnamed: 0,tweet_id,user_id,timestamp_ms,text,lang,tweet_type,created_at,year,month,day,hour,text_len,text_clean,text_clean_len
0,1131172858951024641,393374091,1558527600406,La ruta de easyJet entre Londres y Menorca tra...,es,original,2019-05-22 14:20:00,2019,5,22,14,19,la ruta de easyjet entre londres y menorca tra...,17
1,1131172864147808257,3420691215,1558527601645,@goody_tracy Here’s a list of some of @JonesDa...,en,retweet,2019-05-22 14:20:01,2019,5,22,14,38,goody_tracy heres a list of some of jonesday ...,38
2,1131172867985485824,394376606,1558527602560,@British_Airways,und,reply,2019-05-22 14:20:02,2019,5,22,14,1,british_airways,1
3,1131172909463027720,36488556,1558527612449,Nice change by @AmericanAir. Bikes now pay sta...,en,retweet,2019-05-22 14:20:12,2019,5,22,14,23,nice change by americanair. bikes now pay stan...,23
4,1131172975682605058,14193348,1558527628237,BREAKING:-\nKLM to fly 3x weekly btw @BLRAirpo...,en,retweet,2019-05-22 14:20:28,2019,5,22,14,21,breaking - klm to fly 3x weekly btw blrairport...,22


# Sentiment Analysis

In [20]:
model = ORTModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment", export=True, provider="CUDAExecutionProvider")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
onnx_classifier = pipeline("sentiment-analysis",model=model,tokenizer=tokenizer, device = 0, max_length=512, truncation=True)                                                          

Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.0.1+cu117
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


In [33]:
onnx_classifier(' easyjet harpercollinsch literacy_trust katiepiper_ face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting face_vomiting nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face nauseated_face pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat pouting_cat confounded_face confounded_face confounded_face confounded_face confounded_face confounded_face confounded_face confounded_face tired_face tired_face tired_face tired_face tired_face tired_face tired_face tired_face anxious_face_with_sweat anxious_face_with_sweat anxious_face_with_sweat anxious_face_with_sweat anxious_face_with_sweat anxious_face_with_sweat anxious_face_with_sweat anxious_face_with_sweat angry_face_with_horns angry_face_with_horns angry_face_with_horns angry_face_with_horns angry_face_with_horns angry_face_with_horns angry_face_with_horns angry_face_with_horns grimacing_face grimacing_face grimacing_face grimacing_face grimacing_face grimacing_face grimacing_face grimacing_face frowning_face frowning_face frowning_face frowning_face face_screaming_in_fear face_screaming_in_fear face_screaming_in_fear face_screaming_in_fear fearful_face fearful_face fearful_face fearful_face fearful_face fearful_face pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo pile_of_poo')[0]['label']
# testing if truncation of tensors work for the longest tweet

'negative'

In [45]:
slice_dict = {}
name_list = ['first', 'second', 'third', 'fourth','fifth', 'sixth', 'left']
k = 0
for i in range(0,7000001,1000001):
    j = i + 1000000
    if i == 6000006:
        slice_dict[name_list[k]] = tweets.loc[i:]
    else:
        slice_dict[name_list[k]] = tweets.loc[i:j]    
        k += 1

    

# First mln

In [79]:
# providing the sentiment analysis on the first milion tweets 

first_milion_ds = Dataset.from_pandas(slice_dict['first'])

firts_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(first_milion_ds, "text_clean"), batch_size=20)):

    firts_milion_list.append(out['label']) 

1000001it [1:26:52, 191.87it/s]                       


In [80]:
new_first_milion_ds = first_milion_ds.add_column("sentiment", firts_milion_list)

In [81]:
new_first_milion_df = new_first_milion_ds.to_pandas()


In [82]:
# convert in the csv file

new_first_milion_df.to_csv('first_mln_sentiment')

# Second mln

In [85]:
# providing the sentiment analysis for the second milion 

second_milion_ds = Dataset.from_pandas(slice_dict['second'])

second_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(second_milion_ds, "text_clean"), batch_size=20)):

    second_milion_list.append(out['label'])

1000001it [1:27:06, 191.33it/s]                       


In [87]:
new_second_milion_ds = second_milion_ds.add_column("sentiment", second_milion_list)

In [88]:
new_second_milion_df = new_second_milion_ds.to_pandas()

In [90]:
# store in the csv file 

new_second_milion_df.to_csv('second_mln_sentiment')

In [92]:
# check if all works correctly 

df = pd.read_csv('second_mln_sentiment')

# Third mln

In [105]:
third_milion_ds = Dataset.from_pandas(slice_dict['third'])

third_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(third_milion_ds, "text_clean"), batch_size=20)):

    third_milion_list.append(out['label'])

1000001it [1:29:04, 187.12it/s]                       


In [106]:
new_third_milion_ds = third_milion_ds.add_column("sentiment", third_milion_list)

In [107]:
new_third_milion_df = new_third_milion_ds.to_pandas()

In [110]:
new_third_milion_df.to_csv('third_mln_sentiment')

# Fourth mln

In [25]:
fourth_milion_ds = Dataset.from_pandas(slice_dict['fourth'])

fourth_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(fourth_milion_ds, "text_clean"), batch_size=20)):

    fourth_milion_list.append(out['label'])

889781it [1:19:58, 173.37it/s]                        

In [39]:
new_fourth_milion_ds = fourth_milion_ds.add_column("sentiment", fourth_milion_list)

In [40]:
new_fourth_milion_df = new_fourth_milion_ds.to_pandas()

In [42]:
new_fourth_milion_df.to_csv('fourth_mln_sentiment')

# Fifth mln

In [47]:
fifth_milion_ds = Dataset.from_pandas(slice_dict['fifth'])

fifth_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(fifth_milion_ds, "text_clean"), batch_size=20)):

    fifth_milion_list.append(out['label'])

251081it [22:41, 108.09it/s]                          

In [48]:
new_fith_milion_ds = fith_milion_ds.add_column("sentiment", fifth_milion_list)

In [49]:
new_fith_milion_df =  new_fourth_milion_ds.to_pandas()

In [50]:
new_fith_milion_df.to_csv('fifth_mln_sentiment')

# Sixth mln

In [51]:
sixth_milion_ds = Dataset.from_pandas(slice_dict['sixth'])

sixth_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(sixth_milion_ds, "text_clean"), batch_size=20)):

    sixth_milion_list.append(out['label'])

In [52]:
new_sixth_milion_ds = sixth_milion_ds.add_column("sentiment", sixth_milion_list)

In [56]:
new_sixth_milion_df = new_sixth_milion_ds.to_pandas()

In [57]:
new_sixth_milion_df.to_csv('sixth_mln_sentiment')

# Left

In [60]:
left_milion_ds = Dataset.from_pandas(slice_dict['left']) 

left_milion_list = []
for out in tqdm(onnx_classifier(KeyDataset(left_milion_ds, "text_clean"), batch_size=20)):

    left_milion_list.append(out['label'])

94129it [08:11, 191.57it/s]                         


In [61]:
new_left_milion_ds = left_milion_ds.add_column("sentiment", left_milion_list)

In [62]:
new_left_milion_df = new_left_milion_ds.to_pandas()

In [63]:
new_left_milion_df.to_csv('left_mln_sentiment')

# Mrging csv

In [6]:
df_mln_1 = pd.read_csv('first_mln_sentiment')
df_mln_2 = pd.read_csv('second_mln_sentiment')
df_mln_3 = pd.read_csv('third_mln_sentiment')
df_mln_4 = pd.read_csv('fourth_mln_sentiment')
df_mln_5 = pd.read_csv('fifth_mln_sentiment')
df_mln_6 = pd.read_csv('sixth_mln_sentiment')
df_mln_7 = pd.read_csv('left_mln_sentiment')

In [7]:
lst_cvs = [df_mln_1, df_mln_2, df_mln_3, df_mln_4, df_mln_5, df_mln_6, df_mln_7]

for data in lst_cvs:
    data.drop("Unnamed: 0", inplace = True, axis = 1)

In [8]:
final_df = pd.concat(lst_cvs, ignore_index=True)

In [10]:
# creating the scv with all sentiment analysis labels 

final_df.to_csv('df_sentiment')

# Testing effect of batch size


In [None]:
test_df = tweets.loc[0:9999]

test_ds = Dataset.from_pandas(test_df)

len(test_df)

In [None]:
out_lst_st_mln =[]
for text in tqdm(test_df['text_clean']):

    out_lst_st_mln.append(get_score(text)) 


In [None]:
out_lst_st_mln =[]
for out in tqdm(onnx_classifier(KeyDataset(test_ds, "text_clean"), batch_size=20)):

    out_lst_st_mln.append(out['label']) 

In [None]:
out_lst_st_mln =[]
for out in tqdm(onnx_classifier(KeyDataset(test_ds, "text_clean"), batch_size=25)):

    out_lst_st_mln.append(out['label']) 

In [None]:
out_lst_st_mln =[]
for out in tqdm(onnx_classifier(KeyDataset(test_ds, "text_clean"), batch_size=30)):

    out_lst_st_mln.append(out['label']) 

In [None]:
out_lst_st_mln =[]
for out in tqdm(onnx_classifier(KeyDataset(test_ds, "text_clean"), batch_size=15)):

    out_lst_st_mln.append(out['label']) 

In [None]:
tweets_ds_6mln = Dataset.from_pandas(tweets)
len(tweets_ds_6mln)

In [None]:
final_list = []
for out in tqdm(onnx_classifier(KeyDataset(tweets_ds_6mln, "text_clean"), batch_size=20)):

    final_list.append(out['label']) 

In [None]:
new_tweets_ds_6mln = tweets_ds_6mln.add_column("sentiment", final_list)

In [5]:
df_final =  pd.read_csv('df_sentiment')

In [33]:
df_final

Unnamed: 0.1,Unnamed: 0,tweet_id,user_id,timestamp_ms,text,lang,tweet_type,created_at,year,month,day,hour,text_len,text_clean,text_clean_len,sentiment
0,0,1131172858951024641,393374091,1558527600406,La ruta de easyJet entre Londres y Menorca tra...,es,original,2019-05-22 14:20:00,2019,5,22,14,19,la ruta de easyjet entre londres y menorca tra...,17,neutral
1,1,1131172864147808257,3420691215,1558527601645,@goody_tracy Here’s a list of some of @JonesDa...,en,retweet,2019-05-22 14:20:01,2019,5,22,14,38,goody_tracy heres a list of some of jonesday ...,38,neutral
2,2,1131172867985485824,394376606,1558527602560,@British_Airways,und,reply,2019-05-22 14:20:02,2019,5,22,14,1,british_airways,1,neutral
3,3,1131172909463027720,36488556,1558527612449,Nice change by @AmericanAir. Bikes now pay sta...,en,retweet,2019-05-22 14:20:12,2019,5,22,14,23,nice change by americanair. bikes now pay stan...,23,positive
4,4,1131172975682605058,14193348,1558527628237,BREAKING:-\nKLM to fly 3x weekly btw @BLRAirpo...,en,retweet,2019-05-22 14:20:28,2019,5,22,14,21,breaking - klm to fly 3x weekly btw blrairport...,22,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6094130,6094130,1244696703690772485,278698748,1585593794163,Me parece a mí o el avión es más grande que el...,es,retweet,2020-03-30 20:43:14,2020,3,30,20,14,me parece a m o el avin es ms grande que el si...,14,negative
6094131,6094131,1244696708983984131,246520593,1585593795425,Today’s random pic of the day is the one of Vo...,en,original,2020-03-30 20:43:15,2020,3,30,20,27,todays random pic of the day is the one of vol...,26,positive
6094132,6094132,1244696710447800320,109284383,1585593795774,@spbverhagen @markduursma @StijnBz @KLM @AirFr...,nl,retweet,2020-03-30 20:43:15,2020,3,30,20,18,spbverhagen markduursma stijnbz klm airfrance...,18,negative
6094133,6094133,1244696713350217728,1223576386432126976,1585593796466,Tweede Kamer stemt over vliegtaks https://t.co...,nl,retweet,2020-03-30 20:43:16,2020,3,30,20,44,tweede kamer stemt over vliegtaks via telegraa...,43,negative
