<div style="direction:rtl;text-align:center"><img src="https://mohammadkh.ir/github/logo.png" alt="Mohammadkh.ir" style="width: 250px;"/></div>
<h1><div style="direction:rtl;text-align:center">PreProcessing</div></h1>

# Start

In [1]:
import pandas as pd
import nltk
import hazm
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the raw news table from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../__data/news.csv')
data.head(n=5)

Unnamed: 0,NewsID,Title,Body,Date,Time,Category,Category2
0,843656,\nوزير علوم درجمع استادان نمونه: سن بازنشستگي ...,\nوزير علوم در جمع استادان نمونه كشور گفت: از ...,\n138/5//09,\n0:9::18,\nآموزشي-,\nآموزشي
1,837144,\nگردهمايي دانش‌آموختگان موسسه آموزش عالي سوره...,\nبه گزارش سرويس صنفي آموزشي خبرگزاري دانشجويا...,\n138/5//09,\n1:4::11,\nآموزشي-,\nآموزشي
2,436862,\nنتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور...,\nنتايج آزمون دوره‌هاي فراگير مقاطع كارشناسي و...,\n138/3//07,\n1:0::03,\nآموزشي-,\nآموزشي
3,227781,\nهمايش يكروزه آسيب شناسي مفهوم روابط عمومي در...,\n,\n138/2//02,\n1:3::42,\nاجتماعي-خانواده-,\nاجتماعي
4,174187,\nوضعيت اقتصادي و ميزان تحصيلات والدين از مهمت...,\nمحمدتقي علوي يزدي، مجري اين طرح پژوهشي در اي...,\n138/1//08,\n1:1::49,\nآموزشي-,\nآموزشي


# Word Tokenize

## Sentence

In [30]:
# Split a short Persian paragraph into its sentences.
sample_paragraph = 'ما هم برای وصل کردن آمدیم! ولی برای پردازش، جدا بهتر نیست؟'
sent_tokenize = hazm.sent_tokenize(sample_paragraph)
sent_tokenize

['ما هم برای وصل کردن آمدیم!', 'ولی برای پردازش، جدا بهتر نیست؟']

## Word

In [29]:
# Split one Persian sentence into word and punctuation tokens.
sample_sentence = 'ولی برای پردازش، جدا بهتر نیست؟'
word_tokenize = hazm.word_tokenize(sample_sentence)
word_tokenize

['ولی', 'برای', 'پردازش', '،', 'جدا', 'بهتر', 'نیست', '؟']

# Stop Words
### Very common words and symbols that carry little meaning on their own (punctuation, pronouns, function words)

## Fa

In [67]:
# Load the Persian stop-word list: one entry per line of StopWords-Fa.txt.
# The encoding is passed explicitly because the file contains Persian text and
# the platform's default codec is not guaranteed to be UTF-8.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open('StopWords-Fa.txt', encoding='utf-8') as stopwords_file:
    # Iterating the file yields one line at a time; drop the trailing newline.
    stopwords_fa = [line.rstrip('\n') for line in stopwords_file]
print(stopwords_fa[:50])

['!', '"', '#', '(', ')', '*', ',', '-', '.', '/', ':', '[', ']', '«', '»', '،', '؛', '؟', 'آباد', 'آخ', 'آخر', 'آخرها', 'آخه', 'آدمهاست', 'آرام', 'آرام آرام', 'آره', 'آری', 'آزادانه', 'آسان', 'آسیب پذیرند', 'آشنایند', 'آشکارا', 'آقا', 'آقای', 'آقایان', 'آمد', 'آمدن', 'آمده', 'آمرانه', 'آن', 'آن گاه', 'آنان', 'آنانی', 'آنجا', 'آنرا', 'آنطور', 'آنقدر', 'آنها', 'آنهاست']


## En

In [68]:
# Make sure the NLTK stop-word corpus is available locally, then load the
# English list and preview its first 50 entries.
nltk.download('stopwords')
stopwords_en = nltk.corpus.stopwords.words('english')
first_fifty_en = stopwords_en[:50]
print(first_fifty_en)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


## Run

In [69]:
# One combined list: Persian entries first, then English.
stopwords = [*stopwords_fa, *stopwords_en]

In [33]:
# Show the raw tokens, then only the tokens that are not stop words.
print(word_tokenize)
[token for token in word_tokenize if token not in stopwords]

['ولی', 'برای', 'پردازش', '،', 'جدا', 'بهتر', 'نیست', '؟']


['پردازش']

# Simplify and root words (stemming and lemmatization)

## Simplify (stemming)

In [24]:
# Create the hazm stemmer (reused later in the notebook) and try it on a plural noun.
stemmer = hazm.Stemmer()
plural_word = 'کتاب‌ها'
stemmer.stem(plural_word)


'کتاب'

## Root (lemmatization)

In [25]:
# Create the hazm lemmatizer (reused later in the notebook) and try it on a
# conjugated verb; for this input the result joins two parts with '#'.
lemmatizer = hazm.Lemmatizer()
conjugated_verb = 'می‌روم'
lemmatizer.lemmatize(conjugated_verb)

'رفت#رو'

# Tfidf Vectorizer
## Weight each word by how often it occurs in a document, scaled down for words that occur in many documents

In [49]:
# Four toy sentences used to illustrate TF-IDF weighting.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# ngram_range=(min_n, max_n) extracts every n-gram with min_n <= n <= max_n;
# (1, 1) therefore keeps single words only.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(corpus)

# Print the learned vocabulary, the matrix shape, and the sparse matrix entries.
for shown in (vectorizer.get_feature_names_out(), X.shape, X):
    print(shown)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
(4, 9)
  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
  (1, 5)	0.5386476208856763
  (1, 1)	0.6876235979836938
  (1, 6)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 8)	0.281088674033753
  (2, 4)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 0)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 8)	0.267103787642168
  (3, 1)	0.46979138557992045
  (3, 2)	0.5802858236844359
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 8)	0.38408524091481483


# Run all on data

In [60]:
# Build the modelling table: one cleaned text document and one category label
# per news item in `data`.

# A set gives constant-time membership tests; the combined stop-word list would
# otherwise be scanned once for every token of every article.
stopword_lookup = set(stopwords)

# Collect plain dicts and build the DataFrame once at the end; assigning
# dataset.loc[index] inside the loop enlarges the frame on every iteration.
records = []
for title, body, category in zip(data['Title'], data['Body'], data['Category2']):
    tokens = hazm.word_tokenize(title + ' ' + body)                      # tokenize title + body together
    kept_tokens = [w for w in tokens if w not in stopword_lookup]        # drop stop words
    stemmed = [stemmer.stem(w) for w in kept_tokens]                     # simplify (stem)
    # The lemmatizer can return two parts joined by '#'; turn the separator
    # into a space so each part becomes its own word.
    lemmas = [lemmatizer.lemmatize(w).replace('#', ' ') for w in kept_tokens]
    records.append({
        'title_body': ' '.join(lemmas) + ' ' + ' '.join(stemmed),        # lemmas followed by stems
        'category': category.replace('\n', ''),                          # target, without newline characters
    })

# Reuse the source index so the rows keep the same labels as in `data`.
dataset = pd.DataFrame(records, columns=['title_body', 'category'], index=data.index)

In [61]:
# Fit a unigram TF-IDF model on the cleaned documents and report the matrix shape.
documents = dataset['title_body']
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(documents)
print(X.shape)

(10999, 70960)


In [62]:
from sklearn.preprocessing import LabelEncoder

# Encode each category name as an integer class id, then show the number of
# labels and the category names the encoder learned.
category_names = dataset['category']
le = LabelEncoder()
y = le.fit_transform(category_names)
print(y.shape)
le.classes_

(10999,)


array(['آموزشي', 'اجتماعي', 'اقتصادي', 'بهداشتي', 'تاريخي', 'سياسي',
       'علمي', 'فرهنگي', 'فقه و حقوق', 'مذهبي', 'ورزشي'], dtype=object)

## train & test

In [63]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (scikit-learn's default split size).
# The shuffle seed is fixed so that the split, and therefore the score reported
# further down, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [64]:
from sklearn import svm

# Train a support-vector classifier with scikit-learn's default settings.
svmc = svm.SVC()
svmc.fit(X=X_train, y=y_train)

SVC()

In [65]:
# Evaluate the trained classifier on the held-out split.
svmc.score(X=X_test, y=y_test)

0.8410909090909091

<div class="alert alert-block alert-info">
<div style="direction:rtl;text-align:left"><strong>PreProcessing</strong><br>MohammadReza <strong>Khajedaloi</strong><br><br>
</div>
<div style="direction:rtl;text-align:right">
<a href="http://mohammadkh.ir/">WebSite</a> - <a href="https://github.com/khajedaloi/">GitHub</a> - <a href="https://www.linkedin.com/in/mohammad-kh/">LinkedIn</a>
</div>
</div>