In [1]:
import pandas as pd
import sklearn
import sklearn.utils
from pandarallel import pandarallel
from tqdm.auto import tqdm

In [2]:
pandarallel.initialize(nb_workers=10,progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [3]:
splits = {'train': 'train.csv', 'test': 'test.csv'}
df_train = pd.read_csv("hf://datasets/winvoker/turkish-sentiment-analysis-dataset/" + splits["train"])
df_train.head()

Unnamed: 0,text,label,dataset
0,ürünü hepsiburadadan alalı 3 hafta oldu. orjin...,Positive,urun_yorumlari
1,"ürünlerden çok memnunum, kesinlikle herkese ta...",Positive,urun_yorumlari
2,"hızlı kargo, temiz alışveriş.teşekkür ederim.",Positive,urun_yorumlari
3,Çünkü aranan tapınak bu bölgededir .,Notr,wiki
4,bu telefonu başlıca alma nedenlerim ise elimde...,Positive,urun_yorumlari


In [4]:
df_train = df_train.convert_dtypes(convert_string=True)

In [5]:
df_train = df_train[df_train["text"]!=""]

In [6]:
print(len(df_train[df_train["label"]=="Notr"]))
print(len(df_train[df_train["label"]=="Positive"]))
print(len(df_train[df_train["label"]=="Negative"]))

153825
235949
50905


In [7]:
df_train_positive = df_train[df_train["label"]=="Positive"].sample(50905,random_state=42)
df_train_negative = df_train[df_train["label"]=="Negative"].sample(50905,random_state=42)
df_train_notr = df_train[df_train["label"]=="Notr"].sample(50905,random_state=42)
df_normalized_train : pd.DataFrame = sklearn.utils.shuffle(pd.concat((df_train_negative,df_train_positive,df_train_notr),ignore_index=True),random_state=42)

In [8]:
df_test = pd.read_csv("hf://datasets/winvoker/turkish-sentiment-analysis-dataset/" + splits["test"])
df_test.head()

Unnamed: 0,text,label,dataset
0,Kral akbaba dikkat çekici renklere sahiptir .,Notr,wiki
1,ısrarla korkutmayı başarıyor. sanki korku çok...,Positive,HUMIR
2,Neşe ve Üzüntü köprünün kırılmaya başlamasıyla...,Notr,wiki
3,i phone 5 ten sonra gene 4'' ekranı tercih ett...,Positive,urun_yorumlari
4,Beşinci sezonda diziye yeni oyuncular katıldı .,Notr,wiki


In [9]:
df_test = df_test.convert_dtypes(convert_string=True)

# Text Preprocessing

In [10]:
import re

### Convert to lower case

In [11]:
df_normalized_train['text'] = [token.lower() for token in df_normalized_train['text']]
df_normalized_train.head(5)

Unnamed: 0,text,label,dataset
85855,"öyle böyle bir büyüklük değil, bazı hareketler...",Positive,urun_yorumlari
94508,sıvı detarjanların hemen hemen hepsini denedim...,Positive,urun_yorumlari
131917,araçlar hem ülke içinde hem de ülke dışında sa...,Notr,wiki
71544,bosch-siemens yerli üretimi makinamızda rakibi...,Positive,urun_yorumlari
41678,topaklanmasi cok kotu 2 tane kedimiz var temiz...,Negative,urun_yorumlari


### Remove @ mentions and hyperlinks

In [12]:
found = df_normalized_train[df_normalized_train['text'].str.contains('@')]
found.count()

text       63
label      63
dataset    63
dtype: int64

In [13]:
df_normalized_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 152715 entries, 85855 to 121958
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   text     152715 non-null  object
 1   label    152715 non-null  string
 2   dataset  152715 non-null  string
dtypes: object(1), string(2)
memory usage: 4.7+ MB


In [14]:
df_normalized_train['text'] = df_normalized_train['text'].replace('@[A-Za-z0-9]+', '', regex=True).replace('@[A-Za-z0-9]+', '', regex=True)
found = df_normalized_train[df_normalized_train['text'].str.contains('@')]
found.count()

text       9
label      9
dataset    9
dtype: int64

In [15]:
found = df_normalized_train[df_normalized_train['text'].str.contains('http')]
found.count()

text       35
label      35
dataset    35
dtype: int64

In [16]:
df_normalized_train['text'] = df_normalized_train['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
found = df_normalized_train[df_normalized_train['text'].str.contains('http')]
found.count()

text       0
label      0
dataset    0
dtype: int64

In [17]:
df_normalized_train.shape

(152715, 3)

### Remove Punctations & Emojies & Numbers

In [18]:
sentences = df_normalized_train['text'].copy()
new_sent = []
i = 0
for sentence in sentences:
    new_sentence = re.sub('[0-9]+', '', sentence)
    new_sent.append(new_sentence)
    i += 1
    
df_normalized_train['text'] = new_sent
df_normalized_train['text'].head(5)

85855     öyle böyle bir büyüklük değil, bazı hareketler...
94508     sıvı detarjanların hemen hemen hepsini denedim...
131917    araçlar hem ülke içinde hem de ülke dışında sa...
71544     bosch-siemens yerli üretimi makinamızda rakibi...
41678     topaklanmasi cok kotu  tane kedimiz var temizl...
Name: text, dtype: object

In [19]:
import string

table = str.maketrans('', '', string.punctuation)
sentences = df_normalized_train['text'].copy()
new_sent = []
for sentence in sentences:
    words = sentence.split()
    stripped = [w.translate(table) for w in words]
    new_sent.append(stripped)

In [20]:
df_normalized_train['text'] = new_sent
df_normalized_train['text'].head(5)

85855     [öyle, böyle, bir, büyüklük, değil, bazı, hare...
94508     [sıvı, detarjanların, hemen, hemen, hepsini, d...
131917    [araçlar, hem, ülke, içinde, hem, de, ülke, dı...
71544     [boschsiemens, yerli, üretimi, makinamızda, ra...
41678     [topaklanmasi, cok, kotu, tane, kedimiz, var, ...
Name: text, dtype: object

In [21]:
def join_str(text):
    return " ".join(text)

In [22]:
df_normalized_train["text"] = df_normalized_train["text"].parallel_map(join_str)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15272), Label(value='0 / 15272')))…

In [23]:
df_normalized_train = df_normalized_train.convert_dtypes(convert_string=True)

# Zemberek-NLP

## Tokenization

In [24]:
import time
import logging

from zemberek import (
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishSpellChecker
)

logger = logging.getLogger(__name__)

morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()
spell_checker = TurkishSpellChecker(morphology)

2024-07-16 19:01:51,085 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 2.7105343341827393



KeyboardInterrupt: 

### Sentence Normalization

In [None]:
df_normalized_train["text"]

85855     öyle böyle bir büyüklük değil bazı hareketleri...
94508     sıvı detarjanların hemen hemen hepsini denedim...
131917    araçlar hem ülke içinde hem de ülke dışında sa...
71544     boschsiemens yerli üretimi makinamızda rakibi ...
41678     topaklanmasi cok kotu tane kedimiz var temizle...
                                ...                        
119879                   i̇ç anadolu bölgelerinde söylenir 
103694    babasının hediye ettiği gitar ile müzikle erke...
131932    kapanış nedeni okur sayısının azalmasına ve so...
146867                        savaş çıkınca askerden kaçtı 
121958    bilgi arama modelleri çeşitli unsurlardan etki...
Name: text, Length: 152715, dtype: string

In [None]:
df_train_1 = df_normalized_train.iloc[:10000,:]
df_train_2 = df_normalized_train.iloc[10000:20000,:]
df_train_3 = df_normalized_train.iloc[20000:30000,:]
df_train_4 = df_normalized_train.iloc[30000:40000,:]
df_train_5 = df_normalized_train.iloc[40000:50000,:]
df_train_6 = df_normalized_train.iloc[50000:60000,:]
df_train_7 = df_normalized_train.iloc[60000:70000,:]
df_train_8 = df_normalized_train.iloc[70000:80000,:]
df_train_9 = df_normalized_train.iloc[80000:90000,:]
df_train_10 = df_normalized_train.iloc[90000:100000,:]
df_train_11 = df_normalized_train.iloc[100000:110000,:]
df_train_12 = df_normalized_train.iloc[110000:120000,:]
df_train_13 = df_normalized_train.iloc[120000:130000,:]
df_train_14 = df_normalized_train.iloc[130000:140000,:]
df_train_15 = df_normalized_train.iloc[140000:,:]
dfs = [df_train_1,df_train_2,df_train_3,df_train_4,df_train_5,df_train_6,df_train_7,df_train_8,df_train_9,df_train_10,df_train_11,df_train_12,df_train_13,df_train_14,df_train_15]

In [None]:
def correct_spells(text:str):

    words = text.split(" ")
    for i,word in enumerate(words):
        try:
            words[i] = spell_checker.suggest_for_word(word)[0]
        except BaseException:
            continue
    return " ".join(words)
    

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(correct_spells)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1000), Label(value='0 / 1000'))), …

KeyboardInterrupt: 

In [None]:
def extract_sentence(text):
    return extractor.from_paragraph(text)

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(extract_sentence)

MAP DONE:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
df_normalized_train = df_normalized_train[df_normalized_train["text"]!=""]

In [None]:
def normalize_long_text(paragraph: str) -> str:
    result = []
    for sentence in paragraph:
        result.append(normalizer.normalize(sentence))
    return " ".join(result)

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(normalize_long_text)

MAP DONE:   0%|          | 0/20 [00:00<?, ?it/s]

  d = (d ^ a) * MultiLevelMphf.HASH_MULTIPLIER


### Stopwords

In [None]:
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')
stops = set(stopwords.words('turkish'))
print(stops)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/musasina/nltk_data...


{'mı', 'aslında', 'ya', 'mü', 'sanki', 'gibi', 'da', 'birşey', 'diye', 'biri', 'çok', 'yani', 'ki', 'en', 'nerede', 've', 'hep', 'de', 'için', 'nasıl', 'az', 'mu', 'defa', 'ne', 'bazı', 'şu', 'veya', 'niye', 'nerde', 'hem', 'hepsi', 'acaba', 'ile', 'ise', 'nereye', 'neden', 'niçin', 'birkaç', 'ama', 'eğer', 'çünkü', 'her', 'kez', 'kim', 'belki', 'şey', 'siz', 'bu', 'o', 'hiç', 'biz', 'tüm', 'daha'}


[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def clear_stop_words(sentence):
    return [word for word in sentence.split(" ") if word not in stops]

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(clear_stop_words)

ValueError: number sections must be larger than 0.

### Lemmatization

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(join_str)

In [None]:
def remove_upper_punctuation(text):
    return text.replace('"', '').replace("’", '').replace("'", '').replace("”", '')

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(remove_upper_punctuation)

In [None]:
def sep_text(text):
    return text.split(" ")

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(sep_text)

In [None]:
import zeyrek

analyzer = zeyrek.MorphAnalyzer()

def lemmatize_sent(text):
    return " ".join([analyzer.lemmatize(word)[0][1][0] for word in text])

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(lemmatize_sent)

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(lambda x : x.lower())

### Remove Rare Words

In [None]:
df_normalized_train = pd.concat(dfs,ignore_index=True)

In [None]:
freq = pd.Series(' '.join(df_normalized_train['text']).split()).value_counts()
less_freq = list(freq[freq == 1].index)

In [None]:
df_train_1 = df_normalized_train.iloc[:10000,:]
df_train_2 = df_normalized_train.iloc[10000:20000,:]
df_train_3 = df_normalized_train.iloc[20000:30000,:]
df_train_4 = df_normalized_train.iloc[30000:40000,:]
df_train_5 = df_normalized_train.iloc[40000:50000,:]
df_train_6 = df_normalized_train.iloc[50000:60000,:]
df_train_7 = df_normalized_train.iloc[60000:70000,:]
df_train_8 = df_normalized_train.iloc[70000:80000,:]
df_train_9 = df_normalized_train.iloc[80000:90000,:]
df_train_10 = df_normalized_train.iloc[90000:100000,:]
df_train_11 = df_normalized_train.iloc[100000:110000,:]
df_train_12 = df_normalized_train.iloc[110000:120000,:]
df_train_13 = df_normalized_train.iloc[120000:130000,:]
df_train_14 = df_normalized_train.iloc[130000:140000,:]
df_train_15 = df_normalized_train.iloc[140000:,:]
dfs = [df_train_1,df_train_2,df_train_3,df_train_4,df_train_5,df_train_6,df_train_7,df_train_8,df_train_9,df_train_10,df_train_11,df_train_12,df_train_13,df_train_14,df_train_15]

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(lambda x: " ".join(x for x in x.split() if x not in less_freq))

In [None]:
import unidecode

turkish_chars = "ÇçĞğıİÖöŞşÜü"
normal_chars = unidecode.unidecode(turkish_chars)

def change_turkish_chars(text:str):
    for char,turkish_char in zip(normal_chars,turkish_chars):
        text = text.replace(turkish_char,char)
    return text

ModuleNotFoundError: No module named 'unidecode'

In [None]:
for i,df in tqdm(enumerate(dfs)):
    dfs[i]["text"] = df["text"].parallel_map(change_turkish_chars)

In [None]:
df_normalized_train = pd.concat(dfs,ignore_index=True)

In [None]:
df_normalized_train.to_csv("./datasets/train,csv")
df_test.to_csv("./datasets/test.csv")