In [1]:
import pandas as pd
df=pd.read_csv('all_annotated.tsv', sep = '\t')
df_text=df[['Tweet']]
df_text.head()

Unnamed: 0,Tweet
0,Bugün bulusmami lazimdiii
1,Volkan konak adami tribe sokar yemin ederim :D
2,Bed
3,I felt my first flash of violence at some fool...
4,Ladies drink and get in free till 10:30


# 1. Lowercasing

In [3]:
df_text['Tweet']=df_text['Tweet'].str.lower()
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet']=df_text['Tweet'].str.lower()


Unnamed: 0,Tweet
0,bugün bulusmami lazimdiii
1,volkan konak adami tribe sokar yemin ederim :d
2,bed
3,i felt my first flash of violence at some fool...
4,ladies drink and get in free till 10:30


# 2. Remove Extra Whitespaces

In [8]:
def remove_whitespace(text):
    return  " ".join(text.split())

df_text['Tweet']=df['Tweet'].apply(remove_whitespace)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet']=df['Tweet'].apply(remove_whitespace)


# 3. Tokenization

In [9]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 3.2 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp39-cp39-win_amd64.whl (269 kB)
     -------------------------------------- 269.5/269.5 kB 5.5 MB/s eta 0:00:00
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25


In [12]:
from nltk import word_tokenize
import nltk

nltk.download('punkt')
df_text['Tweet']=df_text['Tweet'].apply(lambda X: word_tokenize(X))
df_text.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\blodg\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet']=df_text['Tweet'].apply(lambda X: word_tokenize(X))


Unnamed: 0,Tweet
0,"[Bugün, bulusmami, lazimdiii]"
1,"[Volkan, konak, adami, tribe, sokar, yemin, ed..."
2,[Bed]
3,"[I, felt, my, first, flash, of, violence, at, ..."
4,"[Ladies, drink, and, get, in, free, till, 10:30]"


# 4. Spelling Correction

In [13]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
     ---------------------------------------- 6.8/6.8 MB 5.3 MB/s eta 0:00:00
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [19]:
from spellchecker import SpellChecker
def spell_check(text):
    
    result = []
    spell = SpellChecker()
    for word in text:
        correct_word = spell.correction(word)
        result.append(correct_word)
    
    return result


df_text['Tweet'] = df_text['Tweet'].apply(spell_check)
df_text.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(spell_check)


Unnamed: 0,Tweet
0,"[burn, None, None]"
1,"[vodka, knock, damn, tribe, solar, hemin, eeri..."
2,[Bed]
3,"[I, felt, my, first, flash, of, violence, at, ..."
4,"[Ladies, drink, and, get, in, free, till, None]"


In [20]:
df_text['Tweet']

0                                       [burn, None, None]
1        [vodka, knock, damn, tribe, solar, hemin, eeri...
2                                                    [Bed]
3        [I, felt, my, first, flash, of, violence, at, ...
4          [Ladies, drink, and, get, in, free, till, None]
                               ...                        
10497    [I, am, at, @, None, in, can, ,, sampan, we, @...
10498    [El, lido, ,, mica, to, north, de, paladin, ,,...
10499    [None, None, man, pah, #, love, @, terra, koto...
10500    [None, @, batch, federal, lowest, hates, :, None]
10501    [i, manta, catering, None, naira, None, Ana, h...
Name: Tweet, Length: 10502, dtype: object

In [21]:
df_text.to_csv('./tweet.tsv', sep="\t")
df_text['Tweet'].to_csv('./tweet1.tsv', sep="\t")

# 5. Removing Stopwords

In [30]:
from nltk.corpus import stopwords
# nltk.download('stopwords')
print(stopwords.words('english'))
en_stopwords = stopwords.words('english')

def remove_stopwords(text):
    result = []
    for token in text:
        if token is None:
            token = ''
        elif token not in en_stopwords:
            result.append(token)
            
    return result

df_text['Tweet'] = df_text['Tweet'].apply(remove_stopwords)
df_text.head()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_stopwords)


Unnamed: 0,Tweet
0,[burn]
1,"[vodka, knock, damn, tribe, solar, hemin, eeri..."
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bumped..."
4,"[Ladies, drink, get, free, till]"


# 6. Removing Punctuations

In [31]:
from nltk.tokenize import RegexpTokenizer

def remove_punct(text):
    
    tokenizer = RegexpTokenizer(r"\w+")
    lst=tokenizer.tokenize(' '.join(text))
    return lst

df_text['Tweet'] = df_text['Tweet'].apply(remove_punct)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_punct)


Unnamed: 0,Tweet
0,[burn]
1,"[vodka, knock, damn, tribe, solar, hemin, eeri..."
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bumped..."
4,"[Ladies, drink, get, free, till]"


# 7. Removing Frequent Words

In [32]:
from nltk import FreqDist

def frequent_words(df):
    
    lst=[]
    for text in df.values:
        lst+=text[0]
    fdist=FreqDist(lst)
    return fdist.most_common(10)
frequent_words(df_text)

[('hates', 3095),
 ('I', 1961),
 ('help', 1266),
 ('de', 685),
 ('due', 525),
 ('la', 350),
 ('e', 330),
 ('amp', 255),
 ('ya', 223),
 ('lot', 221)]

In [33]:
freq_words = frequent_words(df_text)

lst = []
for a,b in freq_words:
    lst.append(b)

def remove_freq_words(text):
    
    result=[]
    for item in text:
        if item not in lst:
            result.append(item)
    
    return result
    
df_text['Tweet']=df_text['Tweet'].apply(remove_freq_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet']=df_text['Tweet'].apply(remove_freq_words)


# 8.Lemmatization

In [41]:
nltk.download('punkt')
nltk.download('all')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\blodg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\blodg\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\blodg\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\blodg\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\blodg\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basqu

True

In [43]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

def lemmatization(text):
    
    result=[]
    wordnet = WordNetLemmatizer()
    for token,tag in pos_tag(text):
        pos=tag[0].lower()
        
        if pos not in ['a', 'r', 'n', 'v']:
            pos='n'
            
        result.append(wordnet.lemmatize(token,pos))
    
    return result


In [44]:
# text = ['running','ran','runs'] 
# lemmatization(text)

['run', 'ran', 'run']

In [45]:
df_text['Tweet']=df_text['Tweet'].apply(lemmatization)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet']=df_text['Tweet'].apply(lemmatization)


Unnamed: 0,Tweet
0,[burn]
1,"[vodka, knock, damn, tribe, solar, hemin, eeri..."
2,[Bed]
3,"[I, felt, first, flash, violence, fool, bump, ..."
4,"[Ladies, drink, get, free, till]"


# 9. Stemming

In [46]:
from nltk.stem import PorterStemmer

def stemming(text):
    porter = PorterStemmer()
    
    result=[]
    for word in text:
        result.append(porter.stem(word))
    return result

In [47]:
df_text['Tweet']=df_text['Tweet'].apply(stemming)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet']=df_text['Tweet'].apply(stemming)


Unnamed: 0,Tweet
0,[burn]
1,"[vodka, knock, damn, tribe, solar, hemin, eeri..."
2,[bed]
3,"[i, felt, first, flash, violenc, fool, bump, i..."
4,"[ladi, drink, get, free, till]"


# 10.Removal of Tags

In [48]:
import re
def remove_tag(text):
    
    text=' '.join(text)
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df_text['Tweet'] = df_text['Tweet'].apply(remove_tag)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_tag)


Unnamed: 0,Tweet
0,burn
1,vodka knock damn tribe solar hemin eeri d
2,bed
3,i felt first flash violenc fool bump i piti fool
4,ladi drink get free till


# 11. Removal of URLs

In [49]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df_text['Tweet'] = df_text['Tweet'].apply(remove_urls)
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['Tweet'] = df_text['Tweet'].apply(remove_urls)


Unnamed: 0,Tweet
0,burn
1,vodka knock damn tribe solar hemin eeri d
2,bed
3,i felt first flash violenc fool bump i piti fool
4,ladi drink get free till


In [51]:
df_text.to_csv('./result.tsv', sep="\t")