In [2]:
import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.isri import ISRIStemmer  # Using ISRI Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from google.colab import files

# Download NLTK resources
nltk.download('punkt') #helps general tokenization: unstructured text into words and sentences.
nltk.download('punkt_tab') #tokenization of structured data my data set (csv file)

# Path to the stop words CSV file
path = "/content/arabstpwrds.csv"

# Load the stop words from the CSV file
#I added to the stop words since after i studied them i saw something missing
stopwords_df = pd.read_csv(path, header=None) #read the file and to tell him there is no header (take into consideration all of it)
stop_words = stopwords_df[0].tolist() #turn it from human readable to python readable (list)

# User upload dataset
uploaded = files.upload() #go to the file and upload it
uploaded_fn = list(uploaded.keys()) #opens a window to let the user upload the wanted file
dataset_fn = uploaded_fn[0]

df = pd.read_csv(dataset_fn) # Load the dataset

# Print the dataset columns to verify the correct column names
#print(df.columns)

# Preprocessing function
stemmer = SnowballStemmer("arabic")
#stemmer = ISRIStemmer()

#function to tokenize the words in the text and stem it using snowball
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# Apply preprocessing_text function done above to the 'TEXT' column
df['processed_text'] = df['TEXT'].apply(preprocess_text)

# TF-IDF calculation
vectorizer = TfidfVectorizer(stop_words=stop_words) #method for TFIDF
tfidf_matrix = vectorizer.fit_transform(df['processed_text']) #fit and transform to put the data in the known TFIDF matrix

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df_final = tfidf_df
df_final



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Saving dataset.csv to dataset.csv




Unnamed: 0,18,400,assembly,cell,cerebral,circuit,cortex,hebbian,hemispheres,man,...,يسحب,يسمح,يعا,يعرف,يقتصر,يقوى,يمك,ينم,يود,يولد
0,0.0,0.139475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.106973,0.0,0.0,0.0,0.106973,0.0,0.106973,0.0,0.0,0.106973,...,0.0,0.0,0.106973,0.0,0.0,0.0,0.08772,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.15009,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.104789,0.104789,0.0,0.104789,0.0,0.104789,0.0,0.0,...,0.0,0.104789,0.0,0.0,0.0,0.104789,0.0,0.0,0.104789,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.089292,0.0,0.0,0.0,0.0,0.0,0.146441,0.0,0.0,0.089292
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100358,0.0,...,0.0,0.0,0.0,0.100358,0.0,0.0,0.0,0.100358,0.0,0.0
