In [14]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [17]:
stop_words = stopwords.words('arabic')
def preprocess(tweet):
    # Tokenize the tweet
    tokens = word_tokenize(tweet)

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a single string
    preprocessed_tweet = ' '.join(filtered_tokens)

    return preprocessed_tweet

In [18]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [19]:
def remove_special_characters(text):
    # Remove all non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Replace all white space characters with a single space
    text = re.sub(r'\s+', ' ', text)

    return text

In [103]:
dataset = pd.read_csv("data_set.csv")
X = dataset['text']
labels = dataset['class']


0        neg
1        neg
2        neg
3        neg
4        neg
        ... 
45269    pos
45270    pos
45271    pos
45272    pos
45273    pos
Name: class, Length: 45274, dtype: object


In [104]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
encoder = LabelEncoder()

# Encode the labels
labels = encoder.fit_transform(labels)

[0 0 0 ... 1 1 1]


In [37]:
preprocessed_tweets = X.str.replace(r'#\S+', '', regex=True)
preprocessed_tweets = X.str.replace(r'@\S+', '', regex=True)
preprocessed_tweets = X.str.replace(r'http\S+', '', regex=True)

In [39]:
preprocessed_tweets = [preprocess(tweet) for tweet in preprocessed_tweets]
preprocessed_tweets = [remove_emojis(tweet) for tweet in preprocessed_tweets]
preprocessed_tweets = [remove_special_characters(tweet) for tweet in preprocessed_tweets]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(preprocessed_tweets, labels, test_size=0.2, random_state=42)

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [68]:
from sklearn.svm import SVC
clf = SVC(kernel="rbf")
clf.fit(X_train_tfidf, y_train)

In [69]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Make predictions on the test set
y_pred = clf.predict(X_test_tfidf)

# Calculate the accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.7893981225842076
Precision: 0.8260654112983151
Recall: 0.7345230226922229


In [109]:

text = "كود خصم ايهرب 10%😍😍😍😍😍😍.                CDW0635\n#الهلال_نيوكاسل #كود_خصم #اكواد #iherb #عنايه #Discounts #skincare #جده_lلان #حساب_الموطن #كرواتيا_الارجنتين #المغرب_فرنسا #where_is_Messi #الارجنتين_فرنسا #WorldCup #Messi𓃵 #WorldCupFinal #chainsawman #JENLISA #جديد_المجد"
text = preprocess(text)
print(text)
X_train_tfidf = vectorizer.transform([text])
print(X_train_tfidf)
y_pred = clf.predict(X_train_tfidf)
print(y_pred)

كود خصم ايهرب 10 % 😍😍😍😍😍😍 . CDW0635 # الهلال_نيوكاسل # كود_خصم # اكواد # iherb # عنايه # Discounts # skincare # جده_lلان # حساب_الموطن # كرواتيا_الارجنتين # المغرب_فرنسا # where_is_Messi # الارجنتين_فرنسا # WorldCup # Messi𓃵 # WorldCupFinal # chainsawman # JENLISA # جديد_المجد
  (0, 1)	1.0
[0]


In [116]:
import pickle
with open("tweet_spam_filter_arabic.pkl","wb") as f:
    pickle.dump(clf,f)
with open("tweet_spam_vectorizer_arabic.pkl","wb") as f:
    pickle.dump(vectorizer,f)

In [117]:
model = None
vectorizer = None
with open('tweet_spam_filter_arabic.pkl', 'rb') as f:
    model = pickle.load(f)
with open('tweet_spam_vectorizer_arabic.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
text = "كود خصم ايهرب 10%😍😍😍😍😍😍.CDW0635\n#الهلال_نيوكاسل #كود_خصم #اكواد #iherb #عنايه #Discounts #skincare #جده_lلان #حساب_الموطن #كرواتيا_الارجنتين #المغرب_فرنسا #where_is_Messi #الارجنتين_فرنسا #WorldCup #Messi𓃵 #WorldCupFinal #chainsawman #JENLISA #جديد_المجد"
text = preprocess(text)
X_train_tfidf = vectorizer.transform([text])
y_pred = model.predict(X_train_tfidf)
print(y_pred)


[0]
