In [38]:
import os
import pandas as pd



In [39]:
data_directory = "dataset/"

# Authors to load; each name is also the name of a sub-folder of data_directory.
authors = ["alen markaryan", "cevdet erdöl", "engin verel", "serkan fıçıcı", "taceddin kutay"]

# Column-oriented accumulator: one entry per text file that is read.
data_dict = {"Author": [], "Text": []}

for author in authors:
    author_folder = os.path.join(data_directory, author)

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the resulting CSV (and any seeded split made from it later)
    # the same on every machine and every run.
    for filename in sorted(os.listdir(author_folder)):
        file_path = os.path.join(author_folder, filename)

        # Skip anything that is not a regular file (for example a
        # ".ipynb_checkpoints" directory), which open() could not read.
        if not os.path.isfile(file_path):
            continue

        # Undecodable bytes are replaced rather than raising, so one badly
        # encoded file does not abort the whole load.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read()

        data_dict["Author"].append(author)
        data_dict["Text"].append(text)

# One row per file: the author label and the full text of the file.
df = pd.DataFrame(data_dict)

# Persist the assembled dataset so later stages can start from the CSV.
csv_file_path = "my_dataset.csv"
df.to_csv(csv_file_path, index=False)

print(f"CSV file saved to {csv_file_path}.")

CSV file saved to my_dataset.csv.


In [40]:
# Reload the dataset from the CSV file written above; this rebinds `df` to the
# on-disk version of the data.
df = pd.read_csv("my_dataset.csv")

In [41]:
# Quick look at the loaded frame: first rows, schema, then summary statistics.
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()

print(df.describe())

           Author                                               Text
0  alen markaryan  Son oynadığı deplasman maçında 6 yiyen takımla...
1  alen markaryan  \nSon iki aydır Beşiktaş camiasında yoğunlaşan...
2  alen markaryan  \n\nBeşiktaş maçı varsa Beşiktaş maçı konuşulu...
3  alen markaryan  \n\nPazar günü F.Bahçe sahasında K.Gümrük'le o...
4  alen markaryan  \n\nAlmanya'da Almanya ile Türk vatandaşların ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Author  105 non-null    object
 1   Text    105 non-null    object
dtypes: object(2)
memory usage: 1.8+ KB
None
                Author                                               Text
count              105                                                105
unique               5                                                104
top     alen markaryan  Haberler...\n\nYorumlar...\n\nAnalizler...\n\n.

In [42]:
import re

def clean_text(text):
    """Normalise one raw document for the later tokenization stage.

    Steps, in order: Turkish-aware lower-casing, removal of ``<...>`` tag-like
    spans, and collapsing of every whitespace run into a single space.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    # A missing cell arrives as a float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    # str.lower() is locale-independent: it turns 'I' into 'i' and 'İ' into
    # 'i' followed by U+0307 (combining dot above). Turkish needs 'I' -> 'ı'
    # and 'İ' -> 'i', so both letters are mapped by hand first.
    lowercase_text = text.replace('I', 'ı').replace('İ', 'i').lower()

    # Remove HTML tags (non-greedy, so each tag is matched on its own).
    clean_html = re.sub(r'<.*?>', '', lowercase_text)

    # Collapse newlines, tabs and repeated spaces, then trim both ends.
    clean_whitespace = re.sub(r'\s+', ' ', clean_html).strip()

    return clean_whitespace

def clean_csv(input_csv, output_csv):
    """Clean the 'Text' column of a CSV file and write the result to a new CSV.

    The output keeps every other column and carries the cleaned text in a
    'cleaned_text' column; the original 'Text' column is not written.

    Parameters
    ----------
    input_csv : path of the CSV to read (must contain a 'Text' column).
    output_csv : path of the CSV to write (no index column).
    """
    raw = pd.read_csv(input_csv)

    # Add the cleaned column at the end, then drop the raw one.
    cleaned = raw.assign(cleaned_text=raw['Text'].apply(clean_text)).drop(columns='Text')

    cleaned.to_csv(output_csv, index=False)


# Cleaning stage: raw dataset in, cleaned text out.
input_csv_file, output_csv_file = 'my_dataset.csv', 'cleaned1_output_file.csv'

clean_csv(input_csv=input_csv_file, output_csv=output_csv_file)

In [43]:
#tokenization:
import pandas as pd
from zemberek import TurkishTokenizer

def tokenize_text(text):
    """Tokenize `text` with Zemberek's default Turkish tokenizer.

    Returns a list holding the `content` attribute of every token produced,
    in the order the tokenizer yields them.
    """
    return [tok.content for tok in TurkishTokenizer.DEFAULT.tokenize(text)]

def tokenize_csv(input_csv, output_csv):
    """Tokenize the 'cleaned_text' column of a CSV and write the result.

    The output keeps every other column and carries the token lists in a
    'tokens' column; 'cleaned_text' itself is not written.

    Parameters
    ----------
    input_csv : path of the CSV to read (must contain 'cleaned_text').
    output_csv : path of the CSV to write (no index column).
    """
    frame = pd.read_csv(input_csv)

    # Append the token lists as the last column and discard the source text.
    tokenized = frame.assign(
        tokens=frame['cleaned_text'].apply(tokenize_text)
    ).drop(columns='cleaned_text')

    tokenized.to_csv(output_csv, index=False)


# Tokenization stage: cleaned text in, token lists out.
input_csv_file, output_csv_file = 'cleaned1_output_file.csv', 'tokenized_output_file.csv'

tokenize_csv(input_csv=input_csv_file, output_csv=output_csv_file)


In [44]:
import pandas as pd
import ast  
import string
from zemberek import TurkishTokenizer

def remove_punctuation(tokens):
    """Strip ASCII punctuation from every token and drop tokens left empty.

    Only the characters in `string.punctuation` are removed; any other
    character is kept as-is.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    list of str
        The stripped tokens, in their original order, without empty strings.
    """
    # Translation table that deletes each punctuation character.
    deletion_table = str.maketrans("", "", string.punctuation)

    stripped = (tok.translate(deletion_table) for tok in tokens)

    # A token made only of punctuation becomes '' and is filtered out here.
    return [tok for tok in stripped if tok]

def process_csv(input_csv, output_csv):
    """Remove punctuation from the 'tokens' column of a CSV and write the result.

    Note: later cells of this notebook define other functions that are also
    named `process_csv`; whichever definition ran last is the one in effect.

    Parameters
    ----------
    input_csv : path of the CSV to read (must contain a 'tokens' column).
    output_csv : path of the CSV to write (no index column).
    """
    frame = pd.read_csv(input_csv)

    # The CSV stores each token list as its Python-literal text, so it is
    # parsed back into a real list before punctuation is stripped.
    token_lists = frame['tokens'].apply(ast.literal_eval)
    frame['tokens'] = token_lists.apply(remove_punctuation)

    frame.to_csv(output_csv, index=False)


# Punctuation stage: token lists in, punctuation-free token lists out.
input_csv_file, output_csv_file = 'tokenized_output_file.csv', 'processed_output_file.csv'

process_csv(input_csv=input_csv_file, output_csv=output_csv_file)


In [45]:
import pandas as pd
import ast
import string
from zemberek import TurkishTokenizer

def remove_stopwords(tokens, stopwords):
    """Return the tokens whose lower-cased form is not in `stopwords`.

    Parameters
    ----------
    tokens : iterable of str
    stopwords : container of str supporting `in` (compared against
        `token.lower()`, so its entries are expected to be lower-case).

    Returns
    -------
    list of str
        The surviving tokens, unchanged and in their original order.
    """
    kept = []
    for tok in tokens:
        if tok.lower() in stopwords:
            continue
        kept.append(tok)
    return kept

def process_csv(input_csv, output_csv, stopwords_file):
    """Remove stop words from the 'tokens' column of a CSV and write the result.

    Note: this notebook defines several functions named `process_csv`; this
    one replaces any earlier definition once its cell has run.

    Parameters
    ----------
    input_csv : path of the CSV to read (must contain a 'tokens' column).
    output_csv : path of the CSV to write (UTF-8, no index column).
    stopwords_file : path of a UTF-8 text file with one stop word per line.
    """
    frame = pd.read_csv(input_csv)

    # Token lists are stored as Python-literal text; parse them back to lists.
    token_lists = frame['tokens'].apply(ast.literal_eval)

    # Each line of the file is one stop word; a set gives fast membership tests.
    with open(stopwords_file, 'r', encoding='utf-8') as handle:
        stop_words = set(handle.read().splitlines())

    frame['tokens'] = token_lists.apply(remove_stopwords, args=(stop_words,))

    frame.to_csv(output_csv, index=False, encoding='utf-8')


# Stop-word stage: punctuation-free token lists in, filtered token lists out.
input_csv_file, output_csv_file = 'processed_output_file.csv', 'nostopwords_output_file0.csv'
stopwords_file = 'stopwords.txt'

process_csv(
    input_csv=input_csv_file,
    output_csv=output_csv_file,
    stopwords_file=stopwords_file,
)


In [None]:
import pandas as pd
from zemberek import TurkishMorphology
import ast

# Shared analyzer, created lazily by _get_morphology() on first use.
_MORPHOLOGY = None


def _get_morphology():
    """Return the shared TurkishMorphology instance, creating it on first use."""
    global _MORPHOLOGY
    if _MORPHOLOGY is None:
        _MORPHOLOGY = TurkishMorphology.create_with_defaults()
    return _MORPHOLOGY


def lemmatize_tokens(tokens):
    """Map each token to the stem of its first morphological analysis.

    Tokens for which the analyzer returns no analysis are dropped, so the
    result can be shorter than the input.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    list of str
        One stem per analysable token, in input order.
    """
    # The analyzer is reused across calls: this function is applied once per
    # DataFrame row, and building a new analyzer for every row only repeats
    # the same construction work.
    morphology = _get_morphology()
    all_lemmas = []

    for token in tokens:
        analysis = morphology.analyze(token)

        # Only the first analysis result is used; the loop body runs at most
        # once and not at all when there is no analysis for the token.
        for result in analysis.analysis_results:
            all_lemmas.append(result.get_stem())
            break

    return all_lemmas


def process_csv(input_csv, output_csv):
    """Lemmatize the 'tokens' column of a CSV and write the result.

    Note: this notebook defines several functions named `process_csv`; this
    one replaces any earlier definition once its cell has run.

    Parameters
    ----------
    input_csv : path of the CSV to read (must contain a 'tokens' column).
    output_csv : path of the CSV to write (UTF-8, no index column).
    """
    frame = pd.read_csv(input_csv)

    # Token lists are stored as Python-literal text; parse, then lemmatize.
    token_lists = frame['tokens'].apply(ast.literal_eval)
    frame['tokens'] = token_lists.apply(lemmatize_tokens)

    frame.to_csv(output_csv, index=False, encoding='utf-8')


# Lemmatization stage: filtered token lists in, stemmed token lists out.
input_csv_file, output_csv_file = 'nostopwords_output_file0.csv', 'lemmatized_output_file.csv'
process_csv(input_csv=input_csv_file, output_csv=output_csv_file)


In [47]:
#  Applying Bag of Words on data:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


# Load the lemmatized dataset; `df` is rebound here and is the frame the
# training cell takes its 'Author' labels from.
df = pd.read_csv('lemmatized_output_file.csv', encoding='utf-8')  



# Each 'tokens' cell is read back as text (the written-out form of a token
# list); those strings are handed to the vectorizer as the documents.
corpus = df['tokens']

# Create a CountVectorizer (default settings)
vectorizer = CountVectorizer()

# Fit and transform the corpus into a sparse matrix
# NOTE(review): the vocabulary is learned from every row, before any
# train/test split is made.
X_bow = vectorizer.fit_transform(corpus)


# Dense copy of the count matrix with one column per vocabulary term; this is
# the feature frame used for the train/test split below.
df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())


print(df_bow)


     01  02  10  100  10000  100000  1015  103  11  1187  ...  şuur  şöyle  \
0     1   0   0    0      0       0     0    0   1     0  ...     0      0   
1     0   0   3    0      0       0     0    0   0     0  ...     0      0   
2     0   0   0    0      0       0     0    0   1     0  ...     0      0   
3     0   0   0    0      0       0     0    0   1     0  ...     0      0   
4     0   0   0    0      0       0     0    0   0     0  ...     0      0   
..   ..  ..  ..  ...    ...     ...   ...  ...  ..   ...  ...   ...    ...   
100   0   0   0    0      0       0     0    0   0     0  ...     0      0   
101   0   0   0    0      0       0     0    0   0     0  ...     0      0   
102   0   0   0    0      0       0     0    0   0     0  ...     0      0   
103   0   0   0    0      0       0     0    0   0     0  ...     0      1   
104   0   0   0    0      0       0     0    0   0     0  ...     0      0   

     şükran  şükred  şükret  şükür  şüphe  şık  şıkır  şıra  
0

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib


# Target labels, taken from the same frame the bag-of-words rows were built from.
y = df['Author']

# Hold out 20% of the documents for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_bow,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier (fit() returns the estimator itself); the iteration cap
# is raised to 10000.
model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Score the held-out documents.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")

# Persist both the classifier and the fitted vectorizer so they can be
# loaded together later.
joblib.dump(model, 'trained_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


Accuracy: 0.9523809523809523


['vectorizer.pkl']

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


import joblib

# Load the trained model and the fitted vectorizer saved by the training cell.
# joblib files are pickles: only load files that this notebook produced.
model = joblib.load('trained_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Read the text whose author should be predicted.
with open('testing_a_text.txt', 'r', encoding='utf-8') as file:
    real_text = file.read()

# Preprocess the text with the same stages the training data went through
# (clean -> tokenize -> strip punctuation -> drop stop words -> lemmatize).
# The vectorizer's vocabulary was learned from lemmatized tokens, so feeding
# it raw text would look up surface forms that it never saw.
# This requires the preprocessing cells above to have been run in this kernel.
with open('stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    real_text_stop_words = set(stopword_file.read().splitlines())

real_text_tokens = tokenize_text(clean_text(real_text))
real_text_tokens = remove_punctuation(real_text_tokens)
real_text_tokens = remove_stopwords(real_text_tokens, real_text_stop_words)
real_text_lemmas = lemmatize_tokens(real_text_tokens)

# The training documents were token lists in their written-out text form (as
# stored in the CSV); str() of the list reproduces that same form here.
real_text_bow = vectorizer.transform([str(real_text_lemmas)])

# Make predictions on the real text
real_text_predictions = model.predict(real_text_bow)


print("Predicted Author:", real_text_predictions[0])
