In [2]:
import pandas as pd
from sklearn.feature_extraction import text
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
import stop_words as sw
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import Normalizer

## Data preprocessing

### Read data

In [3]:
df_pl_tweets = pd.read_csv("data/pl_scraped_tweets2.csv", delimiter=',')
df_pl_tweets.head(2)

Unnamed: 0.1,Unnamed: 0,id,date,media,lang,content,hashtags,likeCount,replyCount,quoteCount,user_name,user_location,user_isprotected,user_isverified
0,0,1590493636424503296,2022-11-09 23:56:47+00:00,,pl,Patrząc po dzisiejszym twitterze brak powołani...,,82,0,0,ElKova11,"Gdańsk, Polska",False,False
1,1,1590493490454364160,2022-11-09 23:56:12+00:00,,pl,awieeee https://t.co/2g9JCV08Fq,,1,0,0,ms_graceyyy,,False,False


In [4]:
df_pl_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        20000 non-null  int64 
 1   id                20000 non-null  int64 
 2   date              20000 non-null  object
 3   media             1983 non-null   object
 4   lang              20000 non-null  object
 5   content           20000 non-null  object
 6   hashtags          1267 non-null   object
 7   likeCount         20000 non-null  int64 
 8   replyCount        20000 non-null  int64 
 9   quoteCount        20000 non-null  int64 
 10  user_name         20000 non-null  object
 11  user_location     11381 non-null  object
 12  user_isprotected  20000 non-null  bool  
 13  user_isverified   20000 non-null  bool  
dtypes: bool(2), int64(5), object(7)
memory usage: 1.9+ MB


In [5]:
df_en_tweets = pd.read_csv("data/en_scraped_tweets2.csv", delimiter=',')
df_en_tweets.head(2)

Unnamed: 0.1,Unnamed: 0,id,date,media,lang,content,hashtags,likeCount,replyCount,quoteCount,user_name,user_location,user_isprotected,user_isverified
0,0,1586870117631086592,2022-10-30 23:58:13+00:00,,en,@Futuball_io Very good project \n@ronibd_raj \...,"['Futuball', 'FB', 'WorldCup2022', 'BNB']",0,0,0,JibonMi34339450,,False,False
1,1,1586869850672140289,2022-10-30 23:57:09+00:00,,es,@fifaworldcup_es @Nacional gracias firme,,3,0,0,Totoa1899,República de Villa Española,False,False


In [6]:
df_en_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        20000 non-null  int64 
 1   id                20000 non-null  int64 
 2   date              20000 non-null  object
 3   media             5138 non-null   object
 4   lang              20000 non-null  object
 5   content           20000 non-null  object
 6   hashtags          12170 non-null  object
 7   likeCount         20000 non-null  int64 
 8   replyCount        20000 non-null  int64 
 9   quoteCount        20000 non-null  int64 
 10  user_name         20000 non-null  object
 11  user_location     12681 non-null  object
 12  user_isprotected  20000 non-null  bool  
 13  user_isverified   20000 non-null  bool  
dtypes: bool(2), int64(5), object(7)
memory usage: 1.9+ MB


### clean columns

#### Ensure that pl tweets are just in polish and en are just in english

In [7]:
df_pl_tweets = df_pl_tweets.loc[df_pl_tweets['lang'] == 'pl']
df_pl_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19998 entries, 0 to 19999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        19998 non-null  int64 
 1   id                19998 non-null  int64 
 2   date              19998 non-null  object
 3   media             1983 non-null   object
 4   lang              19998 non-null  object
 5   content           19998 non-null  object
 6   hashtags          1266 non-null   object
 7   likeCount         19998 non-null  int64 
 8   replyCount        19998 non-null  int64 
 9   quoteCount        19998 non-null  int64 
 10  user_name         19998 non-null  object
 11  user_location     11380 non-null  object
 12  user_isprotected  19998 non-null  bool  
 13  user_isverified   19998 non-null  bool  
dtypes: bool(2), int64(5), object(7)
memory usage: 2.0+ MB


In [8]:
df_en_tweets = df_en_tweets.loc[df_en_tweets['lang'] == 'en']
df_en_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19867 entries, 0 to 19999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        19867 non-null  int64 
 1   id                19867 non-null  int64 
 2   date              19867 non-null  object
 3   media             5098 non-null   object
 4   lang              19867 non-null  object
 5   content           19867 non-null  object
 6   hashtags          12121 non-null  object
 7   likeCount         19867 non-null  int64 
 8   replyCount        19867 non-null  int64 
 9   quoteCount        19867 non-null  int64 
 10  user_name         19867 non-null  object
 11  user_location     12602 non-null  object
 12  user_isprotected  19867 non-null  bool  
 13  user_isverified   19867 non-null  bool  
dtypes: bool(2), int64(5), object(7)
memory usage: 2.0+ MB


In [9]:
df_list = [df_pl_tweets, df_en_tweets]

##### add new, simpler index

In [10]:
# create new index
for df in df_list:
    df.insert(0, 'ID', range(0, len(df)))
    df = df.set_index('ID')

In [11]:
df_en_tweets.head(3)

Unnamed: 0.1,ID,Unnamed: 0,id,date,media,lang,content,hashtags,likeCount,replyCount,quoteCount,user_name,user_location,user_isprotected,user_isverified
0,0,0,1586870117631086592,2022-10-30 23:58:13+00:00,,en,@Futuball_io Very good project \n@ronibd_raj \...,"['Futuball', 'FB', 'WorldCup2022', 'BNB']",0,0,0,JibonMi34339450,,False,False
2,1,2,1586869642605125632,2022-10-30 23:56:20+00:00,,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,,0,0,0,dbdon4,,False,False
3,2,3,1586869559524577281,2022-10-30 23:56:00+00:00,,en,@OGDfarmer In the meantime @flufworld is build...,,2,0,0,iAmKaolo,,False,False


In [12]:
df_pl_tweets = df_pl_tweets[['ID', 'lang', 'content', 'user_location']]
df_pl_tweets.head(3)

Unnamed: 0,ID,lang,content,user_location
0,0,pl,Patrząc po dzisiejszym twitterze brak powołani...,"Gdańsk, Polska"
1,1,pl,awieeee https://t.co/2g9JCV08Fq,
2,2,pl,"-Brak większej liczby reprezentacji, choćby ty...","Szczecin, Polska"


In [13]:
df_en_tweets = df_en_tweets[['ID', 'lang', 'content', 'user_location']]
df_en_tweets.head(3)

Unnamed: 0,ID,lang,content,user_location
0,0,en,@Futuball_io Very good project \n@ronibd_raj \...,
2,1,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,
3,2,en,@OGDfarmer In the meantime @flufworld is build...,


#### Encode location column

In [14]:
oe = OrdinalEncoder()

df_list = [df_pl_tweets, df_en_tweets]

for df in df_list:
    df['location'] = df['user_location']
    df['location'] = oe.fit_transform(np.array(df['location']).reshape(-1, 1))

In [15]:
df_en_tweets = df_en_tweets.drop(columns = ['user_location'])
df_en_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19867 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        19867 non-null  int64  
 1   lang      19867 non-null  object 
 2   content   19867 non-null  object 
 3   location  12602 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 776.1+ KB


In [16]:
df_pl_tweets = df_pl_tweets.drop(columns = ['user_location'])
df_pl_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19998 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        19998 non-null  int64  
 1   lang      19998 non-null  object 
 2   content   19998 non-null  object 
 3   location  11380 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 781.2+ KB


In [17]:
df_pl_tweets.head(2)

Unnamed: 0,ID,lang,content,location
0,0,pl,Patrząc po dzisiejszym twitterze brak powołani...,462.0
1,1,pl,awieeee https://t.co/2g9JCV08Fq,


In [18]:
df_en_tweets.head(2)

Unnamed: 0,ID,lang,content,location
0,0,en,@Futuball_io Very good project \n@ronibd_raj \...,
2,1,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,


### Clean text content

#### clean special characters and whitespaces

In [19]:
df_list =  [df_pl_tweets, df_en_tweets]

white_spaces = ["\r", "\n","\t", "    "]
punctuation_signs = list("?:!.,;")
special_characters = list("@#")

for df in df_list:

    df['content_clean_1'] = df['content']

    # whitespaces
    for char in white_spaces:
        df['content_clean_1'] = df['content_clean_1'].str.replace(char, " ")

    # quotation marks
    df['content_clean_1'] = df['content_clean_1'].str.replace('"', '')


    # puncuation signs
    for sign in punctuation_signs:
        df['content_clean_1'] = df['content_clean_1'].str.replace(sign, '')

  df['content_clean_1'] = df['content_clean_1'].str.replace(sign, '')
  df['content_clean_1'] = df['content_clean_1'].str.replace(sign, '')


In [20]:
df_pl_tweets['content_clean_1'].head(3)

0    Patrząc po dzisiejszym twitterze brak powołani...
1                        awieeee https//tco/2g9JCV08Fq
2    -Brak większej liczby reprezentacji choćby tyc...
Name: content_clean_1, dtype: object

#### clean hashtags and name tagged users

In [21]:
df_list =  [df_pl_tweets, df_en_tweets]

hashtag_regex = "(#)(((\S)|(\d))*)"
tag_regex = "(@)(((\S)|(\d))*)"

for df in df_list:
    df['content_clean_2'] = df['content_clean_1'].str.replace(pat=hashtag_regex, repl= "")
    df['content_clean_2'] = df['content_clean_2'].str.replace(pat=tag_regex, repl= "")

  df['content_clean_2'] = df['content_clean_1'].str.replace(pat=hashtag_regex, repl= "")
  df['content_clean_2'] = df['content_clean_2'].str.replace(pat=tag_regex, repl= "")
  df['content_clean_2'] = df['content_clean_1'].str.replace(pat=hashtag_regex, repl= "")
  df['content_clean_2'] = df['content_clean_2'].str.replace(pat=tag_regex, repl= "")


In [22]:
df_pl_tweets['content_clean_2'].head(3)

0    Patrząc po dzisiejszym twitterze brak powołani...
1                        awieeee https//tco/2g9JCV08Fq
2    -Brak większej liczby reprezentacji choćby tyc...
Name: content_clean_2, dtype: object

In [23]:
df_en_tweets['content_clean_2'].head(3)

0                         Very good project           
2     Scnrio Worldcup Ind Vs SACricket me Gautam Ga...
3       In the meantime  is building an empire with   
Name: content_clean_2, dtype: object

#### clean emojis

In [24]:
import re

In [25]:

emoji_regex = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

In [26]:
df_list =  [df_pl_tweets, df_en_tweets]

for df in df_list:
    df['content_clean_3'] = df['content_clean_2'].str.replace(pat=emoji_regex, repl= " ")

In [27]:
df_en_tweets['content_clean_3'].head(3)

0                         Very good project           
2     Scnrio Worldcup Ind Vs SACricket me Gautam Ga...
3       In the meantime  is building an empire with   
Name: content_clean_3, dtype: object

In [28]:
df_en_tweets.sample(5)

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3
15612,15510,en,I wish Pakistan qualifies so that we can have ...,3902.0,I wish Pakistan qualifies so that we can have ...,I wish Pakistan qualifies so that we can have ...,I wish Pakistan qualifies so that we can have ...
764,755,en,Me eating knowing pak is knocked out of tye wo...,,Me eating knowing pak is knocked out of tye wo...,Me eating knowing pak is knocked out of tye wo...,Me eating knowing pak is knocked out of tye wo...
11699,11622,en,Still no Manuel Neuer for @FCBayernUS as he nu...,3499.0,Still no Manuel Neuer for @FCBayernUS as he nu...,Still no Manuel Neuer for as he nurses his sh...,Still no Manuel Neuer for as he nurses his sh...
14765,14664,en,"With the World Cup fast approaching, we are yo...",,With the World Cup fast approaching we are you...,With the World Cup fast approaching we are you...,With the World Cup fast approaching we are you...
2712,2692,en,Ye worldcup @imVkohli and @surya_14kumar k naa...,401.0,Ye worldcup @imVkohli and @surya_14kumar k naa...,Ye worldcup and k naam,Ye worldcup and k naam


In [29]:
df_pl_tweets['content_clean_3'].head(3)

0    Patrząc po dzisiejszym twitterze brak powołani...
1                        awieeee https//tco/2g9JCV08Fq
2    -Brak większej liczby reprezentacji choćby tyc...
Name: content_clean_3, dtype: object

In [30]:
df_pl_tweets.sample(5)

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3
5238,5237,pl,#MedTwitter \nAkurat upadku maluszka nie należ...,,#MedTwitter Akurat upadku maluszka nie należy...,Akurat upadku maluszka nie należy bagatelizo...,Akurat upadku maluszka nie należy bagatelizo...
10661,10659,pl,@Ms_Rashfordy Weee hamna kocha pale,,@Ms_Rashfordy Weee hamna kocha pale,Weee hamna kocha pale,Weee hamna kocha pale
9756,9754,pl,@RobertLab_ Widzę sie tam między nimi 😍😍😍😍😍😍😍😍,1729.0,@RobertLab_ Widzę sie tam między nimi 😍😍😍😍😍😍😍😍,Widzę sie tam między nimi 😍😍😍😍😍😍😍😍,Widzę sie tam między nimi
13063,13061,pl,@Ms_Sylvia_Es Hejka Sylwuś 👋🤗jak Ci minął hell...,,@Ms_Sylvia_Es Hejka Sylwuś 👋🤗jak Ci minął hell...,Hejka Sylwuś 👋🤗jak Ci minął helloween-weekend,Hejka Sylwuś jak Ci minął helloween-weekend
4701,4700,pl,W dogrywce dla Argentyny trafili Kempes (łączn...,,W dogrywce dla Argentyny trafili Kempes (łączn...,W dogrywce dla Argentyny trafili Kempes (łączn...,W dogrywce dla Argentyny trafili Kempes (łączn...


#### Remove all 's from en tweets and quotations from polish tweets

In [31]:
df_list =  [df_pl_tweets, df_en_tweets]

df_en_tweets['content_clean_3'] = df_en_tweets['content_clean_3'].str.replace("’s", "")

for df in df_list:
    df['content_clean_4'] = df['content_clean_3'].str.replace("'", "")

In [32]:
df_en_tweets.head()

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3,content_clean_4
0,0,en,@Futuball_io Very good project \n@ronibd_raj \...,,@Futuball_io Very good project @ronibd_raj @...,Very good project,Very good project,Very good project
2,1,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,,@ikkanomics Scnrio Worldcup Ind Vs SACricket m...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...
3,2,en,@OGDfarmer In the meantime @flufworld is build...,,@OGDfarmer In the meantime @flufworld is build...,In the meantime is building an empire with,In the meantime is building an empire with,In the meantime is building an empire with
4,3,en,.@UN .@UNHumanRights .@FIFAcom .@FIFAWorldCup\...,1418.0,@UN @UNHumanRights @FIFAcom @FIFAWorldCup The ...,The should never be awarded to countries ...,The should never be awarded to countries ...,The should never be awarded to countries ...
5,4,en,Some serious questions to be asked if @England...,2075.0,Some serious questions to be asked if @England...,Some serious questions to be asked if don’t g...,Some serious questions to be asked if don’t g...,Some serious questions to be asked if don’t g...


#### Make all lowercase

In [33]:
for df in df_list:
    df['content_clean_5'] = df['content_clean_4'].str.lower()

In [34]:
df_en_tweets.head(2)

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3,content_clean_4,content_clean_5
0,0,en,@Futuball_io Very good project \n@ronibd_raj \...,,@Futuball_io Very good project @ronibd_raj @...,Very good project,Very good project,Very good project,very good project
2,1,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,,@ikkanomics Scnrio Worldcup Ind Vs SACricket m...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,scnrio worldcup ind vs sacricket me gautam ga...


In [35]:
df_pl_tweets.head(2)

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3,content_clean_4,content_clean_5
0,0,pl,Patrząc po dzisiejszym twitterze brak powołani...,462.0,Patrząc po dzisiejszym twitterze brak powołani...,Patrząc po dzisiejszym twitterze brak powołani...,Patrząc po dzisiejszym twitterze brak powołani...,Patrząc po dzisiejszym twitterze brak powołani...,patrząc po dzisiejszym twitterze brak powołani...
1,1,pl,awieeee https://t.co/2g9JCV08Fq,,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9jcv08fq


#### Lemmatize

In [36]:
lemmatizer = WordNetLemmatizer()

In [37]:
df_en_tweets.head()

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3,content_clean_4,content_clean_5
0,0,en,@Futuball_io Very good project \n@ronibd_raj \...,,@Futuball_io Very good project @ronibd_raj @...,Very good project,Very good project,Very good project,very good project
2,1,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,,@ikkanomics Scnrio Worldcup Ind Vs SACricket m...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,scnrio worldcup ind vs sacricket me gautam ga...
3,2,en,@OGDfarmer In the meantime @flufworld is build...,,@OGDfarmer In the meantime @flufworld is build...,In the meantime is building an empire with,In the meantime is building an empire with,In the meantime is building an empire with,in the meantime is building an empire with
4,3,en,.@UN .@UNHumanRights .@FIFAcom .@FIFAWorldCup\...,1418.0,@UN @UNHumanRights @FIFAcom @FIFAWorldCup The ...,The should never be awarded to countries ...,The should never be awarded to countries ...,The should never be awarded to countries ...,the should never be awarded to countries ...
5,4,en,Some serious questions to be asked if @England...,2075.0,Some serious questions to be asked if @England...,Some serious questions to be asked if don’t g...,Some serious questions to be asked if don’t g...,Some serious questions to be asked if don’t g...,some serious questions to be asked if don’t g...


#### Lemmatize and remove stopwords

In [39]:
stop_words_pl = sw.get_stop_words(language="polish")
stop_words_en = sw.get_stop_words(language="english")

In [40]:
df_en_tweets['content_clean_6'] = df_en_tweets['content_clean_5']

lemmatized_texts = []

for row in df_en_tweets['content_clean_6']:
    text = row
    word_tokens = text.split(" ")

    lemmas  = []
    lemmatized_text = []

    for word in word_tokens:
        if word not in stop_words_en:
            lemmas.append(lemmatizer.lemmatize(word, pos = 'v'))

    lemmatized_text = " ".join(lemmas)

    lemmatized_texts.append(lemmatized_text)
    
df_en_tweets['content_clean_6']= lemmatized_texts

In [41]:
df_en_tweets.head()

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3,content_clean_4,content_clean_5,content_clean_6
0,0,en,@Futuball_io Very good project \n@ronibd_raj \...,,@Futuball_io Very good project @ronibd_raj @...,Very good project,Very good project,Very good project,very good project,good project
2,1,en,@ikkanomics Scnrio. Worldcup Ind Vs SA:Cricket...,,@ikkanomics Scnrio Worldcup Ind Vs SACricket m...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,Scnrio Worldcup Ind Vs SACricket me Gautam Ga...,scnrio worldcup ind vs sacricket me gautam ga...,scnrio worldcup ind vs sacricket gautam gambh...
3,2,en,@OGDfarmer In the meantime @flufworld is build...,,@OGDfarmer In the meantime @flufworld is build...,In the meantime is building an empire with,In the meantime is building an empire with,In the meantime is building an empire with,in the meantime is building an empire with,meantime build empire
4,3,en,.@UN .@UNHumanRights .@FIFAcom .@FIFAWorldCup\...,1418.0,@UN @UNHumanRights @FIFAcom @FIFAWorldCup The ...,The should never be awarded to countries ...,The should never be awarded to countries ...,The should never be awarded to countries ...,the should never be awarded to countries ...,never award countries regard think will ...
5,4,en,Some serious questions to be asked if @England...,2075.0,Some serious questions to be asked if @England...,Some serious questions to be asked if don’t g...,Some serious questions to be asked if don’t g...,Some serious questions to be asked if don’t g...,some serious questions to be asked if don’t g...,serious question ask don’t get sellout quarte...


In [77]:
df_pl_tweets['content_clean_6'] = df_pl_tweets['content_clean_5']

lemmatized_texts = []

for row in df_pl_tweets['content_clean_6']:
    text = row
    word_tokens = text.split(" ")

    lemmas  = []
    lemmatized_text = []

    for word in word_tokens:
        if word not in stop_words_pl:
            lemmas.append(lemmatizer.lemmatize(word, pos = 'v'))

    lemmatized_text = " ".join(lemmas)

    lemmatized_texts.append(lemmatized_text)
    
df_pl_tweets['content_clean_6']= lemmatized_texts

In [43]:
df_pl_tweets.head()

Unnamed: 0,ID,lang,content,location,content_clean_1,content_clean_2,content_clean_3,content_clean_4,content_clean_5,content_clean_6
0,0,pl,Patrząc po dzisiejszym twitterze brak powołani...,462.0,Patrząc po dzisiejszym twitterze brak powołani...,Patrząc po dzisiejszym twitterze brak powołani...,Patrząc po dzisiejszym twitterze brak powołani...,Patrząc po dzisiejszym twitterze brak powołani...,patrząc po dzisiejszym twitterze brak powołani...,patrząc dzisiejszym twitterze brak powołania m...
1,1,pl,awieeee https://t.co/2g9JCV08Fq,,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9JCV08Fq,awieeee https//tco/2g9jcv08fq,awieeee https//tco/2g9jcv08fq
2,2,pl,"-Brak większej liczby reprezentacji, choćby ty...",1530.0,-Brak większej liczby reprezentacji choćby tyc...,-Brak większej liczby reprezentacji choćby tyc...,-Brak większej liczby reprezentacji choćby tyc...,-Brak większej liczby reprezentacji choćby tyc...,-brak większej liczby reprezentacji choćby tyc...,-brak większej liczby reprezentacji choćby tyc...
3,3,pl,@OmzRi @PK_GOV_PL @podlaskaPolicja @MS_GOV_PL ...,,@OmzRi @PK_GOV_PL @podlaskaPolicja @MS_GOV_PL ...,To ten sam który uciekł przed wymiarem sp...,To ten sam który uciekł przed wymiarem sp...,To ten sam który uciekł przed wymiarem sp...,to ten sam który uciekł przed wymiarem sp...,który uciekł wymiarem sprawiedliwości aha
4,4,pl,@gggfx129 przecież to będzie kurwa żart jak na...,,@gggfx129 przecież to będzie kurwa żart jak na...,przecież to będzie kurwa żart jak na mundial ...,przecież to będzie kurwa żart jak na mundial ...,przecież to będzie kurwa żart jak na mundial ...,przecież to będzie kurwa żart jak na mundial ...,przecież będzie kurwa żart mundial pojedzie d...


### Vectorize

In [82]:
ngram_range = (1,2)
min_df = 5

tfidf_en = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        min_df=min_df,
                        norm='l2',
                        sublinear_tf=True)


In [103]:
df_en = tfidf_en.fit_transform(df_en_tweets["content_clean_6"]).toarray()
print(df_en.shape)
print(df_en[0].shape)
print(df_en[0])

(19867, 8309)
(8309,)
[0. 0. 0. ... 0. 0. 0.]


In [92]:
print(tfidf_en.get_feature_names_out()[:10])

['01' '10' '10 overs' '10 players' '10 real' '100' '100 day' '100 lovin'
 '1000' '1000 run']
