## ***1. Import Libraries***

In [None]:
import snscrape.modules.twitter as sntwitter
import csv 
import pandas as pd
import xlsxwriter
from openpyxl import Workbook
from openpyxl import load_workbook
import re
from string import punctuation 
import sys
from snowballstemmer import TurkishStemmer
from vnlp import SentimentAnalyzer
from vnlp import StemmerAnalyzer

## ***2. Scrape Tweets and Get Stems of Tweets***

In [None]:
# Rows gathered by the scraping loop accumulate in this list.
tweets_list = list()
# Upper bound on how many tweets the scraping loop keeps before stopping.
maxTweets = 2_000

**Function to remove user tags and URLs from tweets**

In [None]:
def user_and_url_remover(tweet):
    """Strip user mentions and links from a tweet.

    Newlines are treated as spaces and the text is split on single spaces.
    Every token that is empty, starts with "@", or contains "http" anywhere
    is discarded; the remaining tokens are joined with single spaces.
    """
    tokens = tweet.replace("\n", " ").split(" ")
    kept = [
        token
        for token in tokens
        if token and not token.startswith("@") and "http" not in token
    ]
    return " ".join(kept)

**Scrape tweets and convert to dataframe**

In [None]:
# Texts already collected, kept in a set so the duplicate check is a constant
# time lookup instead of rebuilding a list of every stored text per tweet.
# Seeded from tweets_list so re-running this cell still skips earlier rows.
seen_texts = {row[2] for row in tweets_list}

for i, tweet in enumerate(sntwitter.TwitterSearchScraper('kadına şiddet + since:2000-01-01 until:2022-12-3').get_items()):
    # Stop as soon as enough unique tweets have been gathered.
    if len(tweets_list) >= maxTweets:
        break

    print(i)
    print(tweet.content)

    clean_tweet = user_and_url_remover(tweet.content)

    # Keep only the first occurrence of each cleaned text.
    if clean_tweet not in seen_texts:
        seen_texts.add(clean_tweet)
        tweets_list.append([str(tweet.date), tweet.id, clean_tweet])

# Every column (including the tweet id) is stored with the pandas "string" dtype.
tweets = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text'], dtype="string")

**Get the stems of the tweets and convert to a dataframe**

In [None]:
# Single analyzer instance, created once and reused by every stemming() call.
stemmer = StemmerAnalyzer()


def stemming(tweet):
    """Reduce `tweet` to the leading "+"-separated field of each analysis.

    `stemmer.predict(tweet)` yields one string per analysed item; only the
    part before the first "+" is kept. Each kept part is followed by a single
    space, so a non-empty result ends with a trailing space.
    """
    analyses = stemmer.predict(tweet)
    # NOTE(review): presumably each analysis looks like "root+Tag+Tag" - confirm against vnlp docs.
    return "".join(analysis.split("+")[0] + " " for analysis in analyses)


In [None]:
# Stem every tweet text; the index is printed as a simple progress indicator.
stem_tweets = []
for index, tweet in enumerate(tweets['Text'].values):
    print(index)
    stem_tweets.append(stemming(tweet))

# Work on a copy: a plain `stem_tweets_df = tweets` would make both names
# refer to the same DataFrame, so replacing 'Text' would also overwrite the
# unstemmed tweets and the two saved files would end up identical.
stem_tweets_df = tweets.copy()
stem_tweets_df['Text'] = stem_tweets

**Save data to Excel files**

In [None]:
# Write each frame to its own workbook, leaving out the index column.
for frame, path in ((stem_tweets_df, 'stem_tweets.xlsx'), (tweets, 'tweets.xlsx')):
    frame.to_excel(path, index=False)

## ***3. Read Files***

**Read files from Excel**

In [2]:
# Reload the two workbooks written by the save cell so the preprocessing
# cells can run without repeating the scraping and stemming steps.
tweets = pd.read_excel(io='tweets.xlsx')
stem_tweets_df = pd.read_excel(io='stem_tweets.xlsx')

**Define preprocessing functions**

In [3]:
def lowercase_conversion(tweet):
    """Lowercase `tweet` using Turkish casing rules for the letters I and İ.

    Plain ``str.lower()`` is locale-independent: it turns "I" into "i" (the
    Turkish lowercase is the dotless "ı") and turns "İ" into "i" followed by
    the combining dot U+0307. Both letters are therefore mapped explicitly
    before the remaining characters are lowercased.

    Returns the lowercased string.
    """
    # Order matters: handle "İ" first so its replacement "i" is left alone.
    return tweet.replace("İ", "i").replace("I", "ı").lower()

In [4]:
def punctuation_removal(tweet):
    """Drop every non-alphanumeric character from each space-separated word.

    The text is split on single spaces and rejoined with single spaces, so a
    word made up only of removed characters leaves an empty slot (two
    consecutive spaces) in the result.
    """
    return ' '.join(
        ''.join(char for char in word if char.isalnum())
        for word in tweet.split(' ')
    )

In [5]:
# `stopwords` first names the imported NLTK corpus object ...
from nltk.corpus import stopwords
# ... and is then rebound to whatever words('turkish') returns. From here on
# `stopwords` is that word collection, not the corpus object; stopword_removal
# below tests membership against it.
stopwords = stopwords.words('turkish')

def stopword_removal(tweet):
    """Remove stop words from `tweet` without changing the case of the rest.

    The text is split on single spaces. A word is dropped when its lowercase
    form is found in the module-level `stopwords` collection; every other
    word is kept exactly as written and the result is rejoined with single
    spaces.

    Matching is case-insensitive, but the returned text is no longer
    lowercased as a side effect. Lowercasing is a separate preprocessing step
    (lowercase_conversion), and doing it here as well made the
    "stop words only" variants identical to the "lowercase + stop words" ones.
    """
    kept_words = [
        word
        for word in tweet.split(' ')
        if word.lower() not in stopwords
    ]
    return ' '.join(kept_words)

## ***4. Apply Preprocessing and Save 16 Datasets***

In [8]:
def apply_preprocessing(p1, p2, p3, p4):
    """Build one preprocessed variant of the tweet texts and save it to Excel.

    Each flag enables its step only when it equals 1:
        p1: apply lowercase_conversion.
        p2: apply punctuation_removal.
        p3: take the texts from stem_tweets_df instead of tweets.
        p4: apply stopword_removal.

    The enabled steps run per tweet in the order p1, p2, p4. The processed
    texts replace the 'Text' column of a copy of `tweets`, which is written to
    "dataset/<p1><p2><p3><p4>.xlsx". Nothing is returned.
    """
    import os  # local import: only needed to make sure the output folder exists

    source_df = stem_tweets_df if p3 == 1 else tweets
    # The helpers call str methods, so missing values and any non-string
    # cells are normalised to str before processing.
    texts = source_df['Text'].fillna('').astype(str)

    processed = []
    for tweet in texts:
        if p1 == 1:
            tweet = lowercase_conversion(tweet)
        if p2 == 1:
            tweet = punctuation_removal(tweet)
        if p4 == 1:
            tweet = stopword_removal(tweet)
        processed.append(tweet)

    # Copy before assigning: writing into `tweets` itself would overwrite the
    # global frame, so every later call that reads tweets['Text'] would start
    # from the previous call's output instead of the raw texts.
    result_df = tweets.copy()
    result_df['Text'] = processed

    os.makedirs("dataset", exist_ok=True)
    result_df.to_excel(f"dataset/{p1}{p2}{p3}{p4}.xlsx", index=False)

In [9]:
# Produce every combination of the four 0/1 flags, in the order
# 0000, 0001, 0010, ... 1111 (p4 varies fastest, p1 slowest).
for p1 in (0, 1):
    for p2 in (0, 1):
        for p3 in (0, 1):
            for p4 in (0, 1):
                apply_preprocessing(p1, p2, p3, p4)