In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

import warnings
warnings.filterwarnings("ignore")

# Location of the raw labelled dataset (a Google Drive path as mounted in Colab).
# Make sure to replace the path with the path to your specific file
file_path = '/content/drive/MyDrive/data/Data.csv'
# Load the raw CSV; preprocess() below expects it to contain 'text' and 'sentiment' columns.
df = pd.read_csv(file_path)

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw texts and one-hot encode their sentiment labels.

    Steps, in order:

    1. Keep only the ``text`` and ``sentiment`` columns, drop rows whose text
       is missing and drop repeated texts (the first occurrence is kept).
    2. Lower-case the labels, fold their spelling variants onto the canonical
       names and drop rows whose label is not recognised.
    3. Replace ``sentiment`` with one 0/1 indicator column per label.
    4. Strip ``#hashtag`` tokens, then drop rows whose text has 5 characters
       or fewer, or consists only of punctuation, whitespace or digits.
    5. Strip ``@mention`` tokens and ASCII punctuation, then drop rows that
       this stripping left blank.

    Args:
        df: Frame with at least a ``text`` and a ``sentiment`` column. It is
            not modified.

    Returns:
        A new frame holding ``text`` followed by the indicator columns. The
        surviving rows keep their original index labels.
    """
    # Every accepted spelling of a label -> its canonical name. Anything not
    # listed here maps to NaN and the row is dropped.
    label_aliases = {
        'sexist': 'sexist',
        'insult': 'insult',
        'racist': 'racist',
        'profanity': 'profanity',
        'not-offensive': 'notoffensive',
        "notoffensive": "notoffensive",
        "not offensive": "notoffensive",
        "ınsult": "insult",  # dotless Turkish 'ı' variant of "insult"
    }

    # Build a fresh frame instead of mutating a slice in place: the caller's
    # data stays untouched and pandas has no chained assignment to warn about.
    # Rows without a text are removed first because the string cleaning below
    # cannot handle missing values.
    frame = (
        df[['text', 'sentiment']]
        .dropna(subset=['text'])
        .drop_duplicates(subset=['text'])
    )

    # Normalise the labels and discard rows whose label is unknown.
    labels = frame['sentiment'].str.lower().map(label_aliases)
    frame = frame.assign(sentiment=labels).dropna(subset=['sentiment'])

    # Create dummy columns without dropping any category. The dtype is pinned
    # so the indicators are 0/1 integers on every pandas version (pandas >= 2.0
    # would otherwise produce booleans).
    indicators = pd.get_dummies(frame['sentiment'], drop_first=False, dtype='uint8')
    frame = pd.concat([frame.drop(columns=['sentiment']), indicators], axis=1)

    # Remove words that contain a '#' in them entirely
    frame = frame.assign(text=frame['text'].str.replace(r"#\S+", "", regex=True))

    # Keep only rows with more than 5 characters ...
    frame = frame[frame['text'].str.len() > 5]

    # ... that are not made up solely of punctuation, whitespace or digits.
    text = frame['text']
    junk = text.str.contains(r'^[\W_]+$') | text.str.isspace() | text.str.isdigit()
    frame = frame[~junk]

    # Remove @mentions, then ASCII punctuation, from the remaining texts.
    punctuation_translator = str.maketrans('', '', string.punctuation)
    cleaned = (
        frame['text']
        .str.replace(r"@\S+", "", regex=True)
        .str.translate(punctuation_translator)
    )
    frame = frame.assign(text=cleaned)

    # The junk filters above ran before mentions/punctuation were stripped, so
    # a text such as "@someone @other" passes them and only now becomes blank.
    # Drop those rows so no empty texts reach the caller.
    frame = frame[frame['text'].str.strip() != ""]

    return frame
    # Optional, from the caller: df.to_csv("/content/drive/MyDrive/data/Preprocessed_data.csv", index=False)

# Rebind df to the cleaned, one-hot-encoded frame returned by preprocess();
# the raw frame loaded above is no longer referenced after this line.
df = preprocess(df)

In [None]:
# Inspect the preprocessed frame (the rendered output elides the middle rows).
df

Unnamed: 0,text,insult,notoffensive,profanity,racist,sexist
0,hemen cep bank yapıyorum ozaman siteye çökücez...,0,1,0,0,0
1,geçmiş olsun fenerin anasini sik,0,0,1,0,0
2,migros adet bilet var ilgilenen varsa yazsın,0,1,0,0,0
3,çok hızlı gidenlere yavaş demek için geride du...,0,1,0,0,0
4,nolu ile fetöcü öğrencilerin tüm borcu silindi...,0,1,0,0,0
...,...,...,...,...,...,...
81792,evet o zamanda söylemiştin ben çıkıyorum diye,0,1,0,0,0
81793,arap levhası bizi itmese bu depremler hiç olma...,0,0,0,1,0
81795,diyorum ki aileleri bunların başını zorla başı...,0,1,0,0,0
81797,hastanedekiler ismini yağmur koy demişler aney...,0,1,0,0,0


In [None]:
# Install (or upgrade to) the newest turkishnlp release from within the notebook.
!pip install --upgrade turkishnlp

import turkishnlp
from turkishnlp import detector

# Detector instance; the spell-correction demo further down calls
# obj.list_words() and obj.auto_correct() on it.
obj = detector.TurkishNLP()
# Fetches the library's data (the cell output reports "Download is successful").
obj.download()

# NOTE(review): presumably loads the downloaded word list into memory so that
# auto_correct can use it - confirm against the turkishnlp documentation.
obj.create_word_set()

Download is successful


In [None]:
# Spot-check turkishnlp's auto-correction on the first 10 texts of the DataFrame.
for index, text in enumerate(df['text'].iloc[:10]):
    # Split the text into words with the library and let it correct them.
    lwords = obj.list_words(text)
    corrected_text = obj.auto_correct(lwords)

    # auto_correct hands back a list of words; glue them into one string again.
    corrected_text_str = " ".join(corrected_text)

    # Show the text before and after correction, numbered from 1.
    print(f"Original text #{index+1}: {text}")
    print(f"Corrected text #{index+1}: {corrected_text_str}\n")

Original text #1: hemen cep bank yapıyorum ozaman siteye çökücez seninle bugun
Corrected text #1: hemen cep bana yapıyorum ozaman siteye çöküşe seninle bugun

Original text #2: geçmiş olsun fenerin anasini sik
Corrected text #2: geçmiş olsun fenerin anasini sik

Original text #3: migros adet bilet var ilgilenen varsa yazsın
Corrected text #3: migros adet bilet var ilgilenen varsa yazsın

Original text #4: çok hızlı gidenlere yavaş demek için geride duruyoruz
Corrected text #4: çok hızlı gidenlere yavaş demek için geride duruyoruz

Original text #5: nolu ile fetöcü öğrencilerin tüm borcu silindi bizler yüksek kur ve faiz mağduru olduk ylsytazminat egtkonus yuksekkurmagdurlari faizaffi
Corrected text #5: nolu ile fetö öğrencilerin tüm borcu silindi bizler yüksek kur ve faiz mağduru olduk ylsytazminat egtkonus yuksekkurmagdurlari faizani

Original text #6: sevmek insanın yuregi kadarkucukse buyugunu taşıyamazsın
Corrected text #6: sevmek insanın yurei kadarkucukse buyugunu taşıyamazsın

O

In [None]:
# Different approach:

# Collect the distinct whitespace-separated words so each one is matched only once.
# Only the first 10 texts are used here; the full-data variant would be:
#unique_words = set(word for text in df['text'] for word in text.split())
first_10_rows_texts = df['text'].head(10)
unique_words_in_first_10_rows = set()
for text in first_10_rows_texts:
    unique_words_in_first_10_rows.update(text.split())

# Reference word list used to look up the closest match for each word:
# a UTF-8 text file read as one dictionary entry per line.
with open('/content/drive/MyDrive/words.txt', 'r', encoding='utf-8') as file:
    dictionary_lines = file.read().splitlines()
turkish_dict = set(dictionary_lines)

# Calculating Word Similarity: use the standard-library difflib module to score how similar each word from the data is to the words in the dictionary.
import difflib

def find_closest_match(word, dictionary, cutoff=0.8):
    """Return the dictionary entry most similar to ``word``, or ``word`` itself.

    Similarity is the ratio computed by ``difflib.get_close_matches``;
    candidates scoring below ``cutoff`` are ignored.

    Args:
        word: The token to look up.
        dictionary: Iterable of candidate words (a set, list, ...).
        cutoff: Minimum similarity in [0, 1] a candidate must reach.
            Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        The best-scoring candidate, or ``word`` unchanged when it is already
        in the dictionary or no candidate reaches ``cutoff``.
    """
    # Only an identical string can reach the top ratio of 1.0, so an exact hit
    # is always what the full scan would return. For hash-based containers the
    # membership test is O(1), which avoids scoring the whole dictionary for
    # every correctly spelled word.
    if isinstance(dictionary, (set, frozenset, dict)) and word in dictionary:
        return word

    matches = difflib.get_close_matches(word, dictionary, n=1, cutoff=cutoff)
    return matches[0] if matches else word

# Replacing Words with Their Closest Matches: Based on the similarity score, replace words in your original data with the closest dictionary words.
# def correct_text(text, dictionary):
#     corrected_words = [find_closest_match(word, dictionary) for word in text.split()]
#     return ' '.join(corrected_words)

# df['corrected_text'] = df['text'].apply(lambda x: correct_text(x, turkish_dict))

# Resolve every distinct word exactly once and remember the result,
# keyed by the original word.
closest_matches = {}
for word in unique_words_in_first_10_rows:
    closest_matches[word] = find_closest_match(word, turkish_dict)

# Report each word next to the dictionary entry chosen for it.
for original, matched in closest_matches.items():
    print(f"Original: {original}, Closest Match: {matched}")

def correct_text(text, word_matches):
    """Rewrite ``text`` word by word using the ``word_matches`` lookup.

    The text is split on whitespace; each word found in ``word_matches`` is
    swapped for its mapped value, any other word is kept as it is, and the
    result is joined back together with single spaces.
    """
    replaced = []
    for token in text.split():
        replaced.append(word_matches.get(token, token))
    return ' '.join(replaced)

# Demonstrate the word-level replacements on the first 10 rows only.
corrected_texts = []
for text in first_10_rows_texts:
    corrected_texts.append(correct_text(text, closest_matches))

# Print every text next to its corrected version.
for original, corrected in zip(first_10_rows_texts, corrected_texts):
    print(f"Original: {original}\nCorrected: {corrected}\n")



Original: buyuruyor, Closest Match: buyuru
Original: yet, Closest Match: yeti
Original: sonu, Closest Match: sonuç
Original: silindi, Closest Match: silindir
Original: sapan, Closest Match: sapan
Original: baksan, Closest Match: bakan
Original: faizaffi, Closest Match: faizaffi
Original: öğrencilerin, Closest Match: öğrencilik
Original: nolu, Closest Match: nolu
Original: cep, Closest Match: cep
Original: kitabında, Closest Match: icabında
Original: geride, Closest Match: gerdel
Original: hiçbir, Closest Match: hiçbir
Original: saçma, Closest Match: saçma
Original: ki, Closest Match: ki
Original: gitti, Closest Match: gitti
Original: kalbine, Closest Match: kabine
Original: yazsın, Closest Match: yazın
Original: attı, Closest Match: attı
Original: olsun, Closest Match: yosun
Original: ama, Closest Match: ama
Original: güzel, Closest Match: güzel
Original: suresi, Closest Match: sure
Original: evlilik, Closest Match: evlilik
Original: öyle, Closest Match: öyle
Original: egtkonus, Closes