In [1]:
import re
import pandas as pd
import nltk
import string
import spacy
from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('reuters')
nltk.download('punkt_tab')
nltk.download('punkt', force=True)
nltk.download('stopwords', force=True)
nltk.download('wordnet', force=True)

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Livan\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Livan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Livan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Livan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Livan\AppData\Roaming\nltk_data...


In [2]:
file_ids = reuters.fileids()
data = []

for file_id in file_ids:
    text = reuters.raw(file_id)  # Article text
    categories = reuters.categories(file_id)  # Article categories
    data.append((file_id, text, ", ".join(categories)))  # Store in tuple format

df = pd.DataFrame(data, columns=["File_ID", "Text", "Categories"])

Cleaning Text
- Özel karakterleri, sayıları ve noktalama işaretlerini kaldırır.
- Tüm metini küçük harflere çevirir
- Boşlukları kaldırır

In [3]:
def clean_text(text):
    doc = nlp(text)

    cleaned_tokens = []
    for token in doc:
        if token.ent_type_ in ["GPE", "ORG", "PERSON"]:  #Preserve named entities (e.g., countries, names)
            cleaned_tokens.append(token.text)  #Keep original case
        else:
            cleaned_tokens.append(token.text.lower())  #Convert other words to lowercase

    text = " ".join(cleaned_tokens)
    
    text = re.sub(r'\d+', '', text)  #Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  #Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  #Remove spaces

    return text


Tokenization ve Stopword Removal

In [4]:
def tokenize_and_remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words] #Remove stopwords
    return filtered_tokens

Stemming ve Lemmatization

In [5]:
def apply_stemming(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokens]


def apply_lemmatization(tokens):
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

Edit Distance
- Yanlış yazılan kelimeleri en yakın olan kelimeye düzeltir

In [6]:
from nltk.metrics import edit_distance

def correct_typo(word, vocab):
    closest_match = min(vocab, key=lambda x: edit_distance(word, x)) if word in vocab else word
    return closest_match

In [7]:
sample_text = df['Text'].iloc[0]

cleaned_text = clean_text(sample_text)
tokens = tokenize_and_remove_stopwords(cleaned_text)
stemmed_tokens = apply_stemming(tokens)
lemmatized_tokens = apply_lemmatization(tokens)

print("Original Text:\n", sample_text[:500])  # Show first 500 chars
print("\nCleaned Text:\n", cleaned_text)
print("\nTokenized (No Stopwords):\n", tokens)
print("\nStemmed Tokens:\n", stemmed_tokens)
print("\nLemmatized Tokens:\n", lemmatized_tokens)

Original Text:
 ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict wo

Cleaned Text:

Tokenized (No Stopwords):

Stemmed Tokens:
 ['asian', 'export', 'fear', 'damag', 'usjapan', 'rift', 'mount', 'trade', 'friction', 'us', 'japan', 'rais', 'fear', 'among', 'mani', 'asia', 'export', 'nation', 'row', 'could', 'inflict', 'far', 'reach', 'econom', 'damag', 'businessmen', 'offici', 'said', 'told', 'reuter', 'correspond', 'asian', 'capit', 'us', 'move', 'japan', 'might', 'boost', 'protectionist', 'sentiment', 'us', 'lead', 'curb', 'american', 'import', '

Tokenization ve N-grams

In [8]:
from nltk.util import ngrams
from collections import Counter

In [9]:
def extract_ngrams(tokens, n = 2):
    n_grams = list(ngrams(tokens, n))
    return [" ".join(gram) for gram in n_grams]


In [10]:
sample_tokens = tokenize_and_remove_stopwords(clean_text(df["Text"].iloc[0])) 
bigrams = extract_ngrams(sample_tokens, 2)
trigrams = extract_ngrams(sample_tokens, 3)

print("\nSample Bigrams:\n", bigrams[:10])
print("\nSample Trigrams:\n", trigrams[:10])



Sample Bigrams:
 ['asian exporters', 'exporters fear', 'fear damage', 'damage usjapan', 'usjapan rift', 'rift mounting', 'mounting trade', 'trade friction', 'friction US', 'US Japan']

Sample Trigrams:
 ['asian exporters fear', 'exporters fear damage', 'fear damage usjapan', 'damage usjapan rift', 'usjapan rift mounting', 'rift mounting trade', 'mounting trade friction', 'trade friction US', 'friction US Japan', 'US Japan raised']


In [11]:
df["Cleaned_Text"] = df["Text"].apply(clean_text)

print(df.head())

      File_ID                                               Text  \
0  test/14826  ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...   
1  test/14828  CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...   
2  test/14829  JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...   
3  test/14832  THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n  ...   
4  test/14833  INDONESIA SEES CPO PRICE RISING SHARPLY\n  Ind...   

                                     Categories  \
0                                         trade   
1                                         grain   
2                                crude, nat-gas   
3  corn, grain, rice, rubber, sugar, tin, trade   
4                             palm-oil, veg-oil   

                                        Cleaned_Text  
0  asian exporters fear damage from usjapan rift ...  
1  CHINA DAILY says VERMIN EAT pct grain stocks a...  
2  JAPAN to revise long term energy demand downwa...  
3  thai trade deficit widens in first quarter Tha...  
4  indone

TF-IDF Vectors
- Textleri sayılar değerlere convert eder

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 5000, ngram_range = (1, 2))
X_tfidf = vectorizer.fit_transform(df["Cleaned_Text"])
tfidf_df = pd.DataFrame(X_tfidf.todense(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF DataFrame:\n", tfidf_df.head())


TF-IDF DataFrame:
     ab  ability  ability to      able   able to  about  about billion  \
0  0.0      0.0         0.0  0.026051  0.026152    0.0            0.0   
1  0.0      0.0         0.0  0.000000  0.000000    0.0            0.0   
2  0.0      0.0         0.0  0.000000  0.000000    0.0            0.0   
3  0.0      0.0         0.0  0.000000  0.000000    0.0            0.0   
4  0.0      0.0         0.0  0.000000  0.000000    0.0            0.0   

   about dlrs  about mln  about one  ...  yield  yields  york  you   yr  \
0         0.0        0.0        0.0  ...    0.0     0.0   0.0  0.0  0.0   
1         0.0        0.0        0.0  ...    0.0     0.0   0.0  0.0  0.0   
2         0.0        0.0        0.0  ...    0.0     0.0   0.0  0.0  0.0   
3         0.0        0.0        0.0  ...    0.0     0.0   0.0  0.0  0.0   
4         0.0        0.0        0.0  ...    0.0     0.0   0.0  0.0  0.0   

   zambia  zealand  zero  zinc  zone  
0     0.0      0.0   0.0   0.0   0.0  
1     0.0   

Encoding Categories

In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["Category_Label"] = label_encoder.fit_transform(df["Categories"])

label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.transform(label_encoder.classes_)))
print("\nCategory Mapping:\n", label_mapping)


Category Mapping:
 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114: 114, 115: 115, 116: 116, 117: 117, 118: 118, 119: 119,

In [14]:
from sklearn.model_selection import train_test_split

X = X_tfidf
y = df["Category_Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)


Training Set Shape: (8630, 5000)
Testing Set Shape: (2158, 5000)


In [17]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
unique_labels = sorted(set(y_test))
target_names = [label_encoder.classes_[i] for i in unique_labels]

print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names))



Accuracy: 0.7025

Classification Report:
                                                                                      precision    recall  f1-score   support

                                                                                acq       0.66      1.00      0.80       452
                                                                         acq, crude       0.00      0.00      0.00         4
                                                                acq, crude, nat-gas       0.00      0.00      0.00         4
                                                                          acq, earn       0.00      0.00      0.00         5
                                                                     acq, livestock       0.00      0.00      0.00         1
                                                                      acq, pet-chem       0.00      0.00      0.00         2
                                                                         acq, tra

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
