In [74]:
%env GOOGLE_APPLICATION_CREDENTIALS=C:\key.json

env: GOOGLE_APPLICATION_CREDENTIALS=C:\key.json


In [75]:
import pandas as pd
from pycontractions  import Contractions
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from flair.models import SequenceTagger
from flair.data import Sentence
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from google.cloud import translate_v2 as translate
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
# import chardet

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
# Google client object
client = translate.Client()

# Read Tweets Data

In [77]:
# Read the data --> encoding utf-8, some characters were not coming correctly which utf-8 solved the issue
df = pd.read_csv('./data/cleandata.csv', parse_dates=['Date'],encoding = "utf-8") 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2668 entries, 0 to 2667
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Tweets          2668 non-null   object        
 1   Retweets        2668 non-null   int64         
 2   Likes           2668 non-null   int64         
 3   Date            2668 non-null   datetime64[ns]
 4   Cleaned_Tweets  2668 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 104.3+ KB


In [78]:
def get_basic_info(dataframe):
    cols_list = dataframe.columns.tolist()
    data_types = df.dtypes    
    
    for col in cols_list:        
        # get the number of unique entries for each column
        no_unique_value = dataframe[col].nunique()
        print(f'The number of unique values for column {col} is {no_unique_value}')

        # Check for the data type and get the min and max value
        if data_types[col] != 'object':
            min_value = dataframe[col].min()
            max_value = dataframe[col].max()
            print(f'The min value for column {col} is {min_value}')
            print(f'The max value for column {col} is {max_value}\n')
        else:
            print('\n')  
get_basic_info(df)     

The number of unique values for column Tweets is 2642


The number of unique values for column Retweets is 1834
The min value for column Retweets is 41
The max value for column Retweets is 681707

The number of unique values for column Likes is 2598
The min value for column Likes is 933
The max value for column Likes is 4780787

The number of unique values for column Date is 2668
The min value for column Date is 2022-01-27 21:00:09
The max value for column Date is 2022-10-27 16:17:39

The number of unique values for column Cleaned_Tweets is 2382




# Data Cleaning

In [79]:
cont = Contractions('./ignore/GoogleNews-vectors-negative300.bin')
cont.load_models()

In [80]:
import re
# Replace the non-ASCII characters
df['Cleaned_Tweets'] = df['Cleaned_Tweets'].replace({r'[^\x00-\x7F]+':''}, regex=True)

# stopwords list
stop = stopwords.words('english')
stop += ['im','ie','ete', 'dont', 'cant', 'would','wont','doesnt','must','might','also','almost','so', 'haha']

def clean(text):
    # Remove spaces from beginning and ending of the text
    text = text.strip()
    # Fix quotes
    text = text.replace("’", "'") \
        .replace("‘", "'") \
        .replace("”", '"') \
        .replace("“", '"')

    # Replace &amp; with and
    text = text.replace('&amp;','and')

    text = text.replace('&gt;', 'greater than ')

    text = text.replace('&lt;', 'less than ')

    text = text.replace('-', '')
    
    # Fix contractions
    text = list(cont.expand_texts([text], precise=True))[0]
   
    #val = re.fullmatch(r'[\w\s]*((?<=\d)[\.,\/:%]|(?=\d))*[\w\s]*',text, flags=re.MULTILINE)
    # Replace only the punctuations from the text --> do not remove decimals or , or % from digits
    text = re.sub(r'[^\w\s%](?!\d)', ' ', text, flags=re.MULTILINE)  

    return text

df['Cleaned_Tweets'] = df['Cleaned_Tweets'].apply(clean)

In [81]:
df.shape

(2668, 5)

In [82]:
# Count the tokens of the tweets
df['Token_Counts'] = df['Cleaned_Tweets'].apply(lambda x: len(x.split(' ')))
# Remove tweets with less than 3 tokens
df = df[df['Token_Counts'] > 3].reset_index(drop=True).copy()
df.shape

(1996, 6)

In [83]:
df['tweet_language'] = df['Cleaned_Tweets'].apply(lambda x : client.detect_language(x)['language'])

In [84]:
# Translate the text with different language to english --> tweets that were not recognized in english will be translated to english
df_temp = df[df['tweet_language'] != 'en'].copy()
df_temp['Cleaned_Tweets'] = df_temp['Cleaned_Tweets'].apply(lambda x : client.translate(x, target_language='en')['translatedText'])
# Replace the punctuations in the translated text
df_temp['Cleaned_Tweets'] = df_temp['Cleaned_Tweets'].replace({r'[^\w\s]':''}, regex=True)
df_temp.head(10)

Unnamed: 0,Tweets,Retweets,Likes,Date,Cleaned_Tweets,Token_Counts,tweet_language
67,Vox Populi Vox Dei,5709,53880,2022-10-19 16:59:23,Voice of the people voice of God,4,la
353,"Baltasar Gracián, Oráculo Manual y Arte de Pru...",2903,36061,2022-09-19 19:33:52,Baltasar Gracin Oracle Manual and Art of Prudence,9,es
534,Standup is my side-hustle,8866,155973,2022-08-17 04:59:51,Standup is my side hustle,4,da
668,Schadenfreude oder Schatzifreude?,1227,29721,2022-07-24 03:54:01,Schadenfreude or Schatzifreude,4,de
751,@BillyM2k 🐁 + ⌨️ v1.05,290,5508,2022-07-11 21:25:28,v105,4,ku
798,A veritable “sock aficionado”,2974,56477,2022-07-05 17:44:09,A real sock amateur,4,es
810,Happy July 4th! https://t.co/KYN2XO712Z,16091,289550,2022-07-04 20:14:41,Happy July 4th,4,te-Latn
1075,Con te … partirò,5404,88159,2022-05-29 20:38:24,I39ll leave with you,4,es
1466,Per aspera ad astra!,41296,518423,2022-04-26 20:15:32,Through difficulties to the stars,5,la
1471,🚀💫♥️ Yesss!!! ♥️💫🚀 https://t.co/0T9HzUHuh6,348158,2608578,2022-04-25 19:43:22,Yessss,4,ms


In [85]:
# get the index of the df_temp and update df at the same index for the cleaned tweets
df.loc[df_temp.index, 'Cleaned_Tweets'] = df_temp['Cleaned_Tweets']

# Verify if data got updated
df.loc[df_temp.index, 'Cleaned_Tweets']

67                       Voice of the people voice of God
353     Baltasar Gracin Oracle Manual and Art of Prudence
534                             Standup is my side hustle
668                        Schadenfreude or Schatzifreude
751                                                  v105
798                                   A real sock amateur
810                                        Happy July 4th
1075                                 I39ll leave with you
1466                    Through difficulties to the stars
1471                                               Yessss
1688                                    Thank you Germany
1739                                  I love the mariachi
1782    I would like to thank you very much The future...
Name: Cleaned_Tweets, dtype: object

In [86]:
df.loc[1075,'Cleaned_Tweets'] = 'I will leave with you'

In [87]:
# Because of the different language the tokens were not calculated properly
# Count the tokens of the tweets
df['Token_Counts'] = df['Cleaned_Tweets'].apply(lambda x: len(x.split(' ')))
# Remove tweets with less than 3 tokens
df = df[df['Token_Counts'] > 3].reset_index(drop=True).copy()
df.shape

(1991, 7)

In [88]:
df.to_csv('./ignore/data_after_translation.csv',encoding = "utf-8")

In [89]:
# Add extra list into stop words identified from tweets
stop += ["cb", "twh", "og", "onto", "tf", "oge", "fyi", "v", "um", "lb", "g", "bros", "cc", "mgmt", "vw", "aka",
         "tsla", "n", "%", "f","thy", "thee", "bi", "r", "mr", "vu", "dj", "ci", "h", "con", "bf", "lmk", "incl",
         "uh", "ii", "tbc", "mf", "ye", "ya", "eg", "hi", "wow", "v2", "bro", "went", "oh", "bs", "none", "das",
         "guy", "yup", "took", "saw", "obv", "got", "un", "kind", "gave", "gone", "id", "btw", "thank", "due",
         "tbh", "c", "keep", "able", "around", "vs", "per", "yet", "imo", "x", "soon", "away", "coming", "sure",
         "take", "go", "ok", "please", "something", "going", "making", "cannot", "want", "done", "let", "use", "say",
         "made", "see", "back", "possible", "could", "us", "need", "yeah", "know", "get", "yes", "right", "still",
         "think", "make", "like", "many", "much", "people"]
tmp_stop = []

for val in stop:
    tmp_stop.append(val.capitalize())
    tmp_stop.append(val.upper())

stop += tmp_stop

In [90]:
# Remove the stop words
def process_text(text):    
    text = " ".join([word for word in text.split() if word not in stop])
    return text

df['Cleaned_Tweets'] = df['Cleaned_Tweets'].apply(process_text)

In [91]:
# Count the tokens of the tweets
df['Token_Counts'] = df['Cleaned_Tweets'].apply(lambda x: len(x.split(' ')))
# Remove tweets with less than 3 tokens
df = df[df['Token_Counts'] > 3].reset_index(drop=True).copy()
df.shape

(1453, 7)

# Emotions Classification

In [92]:
# Instantiate model pipeline
model = AutoModelForSequenceClassification.from_pretrained(
    "Emanuel/bertweet-emotion-base"
)
tokenizer = AutoTokenizer.from_pretrained(
    "Emanuel/bertweet-emotion-base"
)
device = -1 #torch.cuda.current_device() if torch.cuda.is_available else -1
model_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device
)

In [93]:
df['Emotion_Scores'] = model_pipeline(df['Cleaned_Tweets'].to_list(), top_k=None)
df['Emotion_Scores'].head()

0    [{'label': 'joy', 'score': 0.9908571839332581}...
1    [{'label': 'joy', 'score': 0.6109302639961243}...
2    [{'label': 'sadness', 'score': 0.4522185921669...
3    [{'label': 'joy', 'score': 0.696447491645813},...
4    [{'label': 'anger', 'score': 0.632644712924957...
Name: Emotion_Scores, dtype: object

In [94]:
# Assign top 1 emotion
df['Emotion1'] = df['Emotion_Scores'].apply(lambda x: x[0]['label'])
df['Emotion1'].head()

0        joy
1        joy
2    sadness
3        joy
4      anger
Name: Emotion1, dtype: object

# Topic Extraction

In [95]:
kw_model = KeyBERT()
# Extract the noun keyphrases, so create the vectorizer with the pattern and pass it to KeyBert
vectorizer = KeyphraseCountVectorizer(pos_pattern='<NNP.*>+')
df['Noun_Keyphrases_Score'] = kw_model.extract_keywords(docs=df['Cleaned_Tweets'].to_list(),\
                         vectorizer=vectorizer, stop_words='english', top_n=5)
df['Noun_Keyphrases'] = df['Noun_Keyphrases_Score'].apply(lambda record: [x[0] for x in record])

In [96]:
df['Noun_Keyphrases_Score'].loc[0]

[('twitter', 0.6036), ('lot', 0.3114), ('cool', 0.293)]

In [97]:
df['Noun_Keyphrases']

0        [twitter, lot, cool]
1       [twitter hq, twitter]
2                   [twitter]
3            [twitter, thing]
4                       [fan]
                ...          
1448                   [time]
1449              [manganese]
1450              [manganese]
1451                       []
1452                       []
Name: Noun_Keyphrases, Length: 1453, dtype: object

In [98]:
# Converting score types for serialization
def fix_float_type(input):
    return [(x[0], str(x[1])) for x in input]
df['Noun_Keyphrases_Score'] = df['Noun_Keyphrases_Score'].apply(fix_float_type)

In [99]:
# load tagger
tagger = SequenceTagger.load("flair/pos-english")

def flair_pos_tagging(sentence):
    # print(sentence)
    verbs = set()
    adjectives = set()
    sen = Sentence(sentence)
    tagger.predict(sen)

    for label in sen.get_labels('pos'):
        
        if label.value[0:2] == 'VB' and label.score > 0.75:
            verbs.add(label.data_point.text)
            # print(verbs)
        if label.value[0:2] == 'JJ' and label.score > 0.75:
            adjectives.add(label.data_point.text)
            # print(adjectives)

    return list(verbs), list(adjectives)

df['verbs'], df['adjectives'] = zip(*df['Cleaned_Tweets'].str.lower()\
                                            .apply(flair_pos_tagging))

2022-12-09 18:15:14,044 loading file C:\Users\baira\.flair\models\pos-english\a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-12-09 18:15:14,855 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [None]:
verbs_to_remove = ['get','are','is','am','have','has','been','seen','had','do','took','be',
                    'make','does','like','did','see','was','go','got','get','want','getting','gets', 'exist',
                    'done','doing','went','uses','says','known','let','given' ,'gave','makes','goes',
                    'gone','going','saw','being','were']

def remove_words(row):
    verbs_list = []

    if len(row) > 0:
        for i in row:
            if i not in verbs_to_remove:
                verbs_list.append(i)
    return verbs_list                  

df['verbs'] = df['verbs'].apply(remove_words)

In [None]:
adj_to_remove = ['many','most','much','such']

def remove_words(row):
    adj_list = []

    if len(row) > 0:
        for i in row:
            if i not in adj_to_remove:
                adj_list.append(i)
    return adj_list                  

df['adjectives'] = df['adjectives'].apply(remove_words)

In [None]:
df.to_parquet('./data/processed_data.parquet', index=False)

#### Rough Work --> code that can be used in future

In [None]:
# Remove punctuations
# Adding spaces after the removal of punctuations, as there might not be space 
# added in the text after the use of punctuation
# no_spaces = len(string.punctuation)
    
# text = text.translate(str.maketrans(string.punctuation,' ' * no_spaces))  

# # Remove stopwords
# text = " ".join([word for word in text.split() if word not in stop])

# with open("./data/cleandata.csv", 'rb') as rawdata:
#     result = chardet.detect(rawdata.read())

# # check what the character encoding might be
# print(result)