In [None]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip

In [None]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ar.zip

In [None]:
! unzip wiki.en.zip 

In [None]:
! unzip wiki.ar.zip 

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import FastText
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:

# Download the NLTK resources used by the preprocessing steps.
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)


In [None]:

# Read the username/spam CSV straight from its GitHub raw URL
DATASET_URL = 'https://raw.githubusercontent.com/LokasWiki/public-datasets/main/Datasets/usernames_spam.csv'
df = pd.read_csv(DATASET_URL)


In [None]:

# Convert the 'Category' column to numeric labels
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])

# Combine the stop words for English and Arabic
stop_words = set(stopwords.words('english')).union(set(stopwords.words('arabic')))

# Stemmer and lemmatizer shared by every row
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def _normalize_name(text):
    """Clean one name: lowercase, drop stop words, stem, then lemmatize.

    The steps run in the same order as the original per-column passes:
    whitespace split for stop-word removal and stemming, then
    ``word_tokenize`` on the stemmed string for lemmatization.
    """
    kept = [word for word in text.lower().split() if word not in stop_words]
    stemmed = ' '.join(stemmer.stem(word) for word in kept)
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(stemmed))


# read_csv parses empty cells (and literal values such as "NA" or "null") as
# NaN, which is a float and has no .lower(); coerce everything to str first so
# such rows become an empty name instead of raising AttributeError.
df['Name'] = df['Name'].fillna('').astype(str).apply(_normalize_name)


In [None]:

# Features: the cleaned 'Name' text column
X = df['Name']

# Targets: one-hot encode the integer class labels
y = to_categorical(df['Category'])


In [None]:
# Tokenize the 'Name' column using the Keras tokenizer.
# The text is read from df['Name'] rather than from X: this cell rebinds X to
# the padded integer matrix, so fitting on X would fail when the cell is re-run.
name_texts = df['Name']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(name_texts)
name_sequences = tokenizer.texts_to_sequences(name_texts)

# Pad the sequences to a fixed length
max_length = 20
X = pad_sequences(name_sequences, maxlen=max_length)


In [None]:

# Note: if more RAM is available, the English wiki vectors could be added too.
# Load the pre-trained word embeddings for Arabic
ARABIC_VECTORS_PATH = '/content/wiki.ar.bin'
try:
    # Preferred loader: `FastText.load_fasttext_format` was deprecated and is
    # no longer available in gensim 4.
    from gensim.models.fasttext import load_facebook_model
    arabic_word_vectors = load_facebook_model(ARABIC_VECTORS_PATH)
except ImportError:
    # Older gensim releases that do not ship load_facebook_model
    arabic_word_vectors = FastText.load_fasttext_format(ARABIC_VECTORS_PATH)

# Always go through `.wv`; indexing the model object directly is deprecated.
arabic_wv = arabic_word_vectors.wv
# gensim 4 renamed `wv.vocab` to `wv.key_to_index`; support both.
known_words = arabic_wv.key_to_index if hasattr(arabic_wv, 'key_to_index') else arabic_wv.vocab

# Create an embedding matrix for the words in the vocabulary.
# Row 0 stays all-zero because tokenizer word indices start at 1.
embedding_dim = 300
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    # Only words present in the pre-trained vocabulary get a vector; every
    # other row is left as zeros.
    if word in known_words:
        embedding_matrix[i] = arabic_wv[word]


In [None]:


# Build the network as a single layer list: frozen pre-trained embedding ->
# 1D convolution + max pooling -> dropout -> bidirectional LSTM -> 2-way softmax
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,
    ),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dense(2, activation='softmax'),
])

In [None]:

# Configure training: Adam optimizer, categorical cross-entropy on the
# one-hot targets, accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

# Hold out 20% of the data for testing, then 20% of the remainder for validation
X_rest, X_test, y_rest, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, random_state=42, test_size=0.2)

# Stop after 3 epochs without improvement and restore the best weights
early_stopping = EarlyStopping(restore_best_weights=True, patience=3)

# Fit for at most 10 epochs, tracking the validation split each epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)


In [None]:


# Score the trained model on the held-out test split and report both numbers
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss}')
print(f'Test accuracy: {test_acc}')


In [None]:
# Define the names to check
new_names = ["loka", "صف السادس","fuck you","محمد احمد عبد الغار علي"]

# Apply the same cleaning that was applied to the training names (lowercase,
# stop-word removal, stemming, lemmatization) so the tokenizer sees the same
# kind of tokens it was fitted on.
cleaned_names = []
for name in new_names:
    kept = [word for word in name.lower().split() if word not in stop_words]
    stemmed = ' '.join(stemmer.stem(word) for word in kept)
    cleaned_names.append(' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(stemmed)))

# Tokenize and pad the new names to the training sequence length
new_sequences = tokenizer.texts_to_sequences(cleaned_names)
new_sequences = pad_sequences(new_sequences, maxlen=max_length)

# Make predictions with the in-memory `model`; `loaded_model` is only created
# by a later cell, so using it here fails on a fresh top-to-bottom run.
predictions = model.predict(new_sequences)

# Print the predictions
# NOTE(review): assumes class index 0 is the non-spam label assigned by the
# LabelEncoder — confirm against le.classes_.
for name, scores in zip(new_names, predictions):
    if np.argmax(scores) == 0:
        print(f"{name} is not spam.")
    else:
        print(f"{name} is spam.")


In [None]:
import json

# Persist the trained network in HDF5 format
model.save('my_h5_model.h5')

# Persist the tokenizer: the value returned by to_json() is JSON-encoded once
# more when written, so readers must json-decode the file before handing the
# result to tokenizer_from_json.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Restore the network from the HDF5 file
loaded_model = load_model('my_h5_model.h5')

# Restore the tokenizer: json-decode the file contents, then rebuild the
# tokenizer from the decoded value
with open('tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = json.load(f)
loaded_tokenizer = tokenizer_from_json(tokenizer_json)


In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

class NameClassifier:
    """Classify a single name with a saved Keras model and tokenizer.

    The name is cleaned with the same steps used elsewhere in this notebook
    (lowercase, English/Arabic stop-word removal, Porter stemming, WordNet
    lemmatization) before it is tokenized, padded and passed to the model.
    """

    def __init__(self, model_path, tokenizer_path):
        """Load the model and tokenizer from disk and set up the NLTK helpers.

        Args:
            model_path: Path of the saved Keras model, passed to ``load_model``.
            tokenizer_path: Path of the JSON file holding the serialized
                tokenizer; it is json-decoded and then passed to
                ``tokenizer_from_json``.
        """
        self.model = load_model(model_path)
        # Read with the same utf-8 encoding the notebook uses when it writes
        # tokenizer.json, instead of the platform default.
        with open(tokenizer_path, 'r', encoding='utf-8') as f:
            self.tokenizer = tokenizer_from_json(json.load(f))

        # Combine the stop words for English and Arabic
        self.stop_words = set(stopwords.words('english')).union(set(stopwords.words('arabic')))

        # Initialize stemmer and lemmatizer
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_name(self, name):
        """Return the cleaned form of ``name``.

        Non-string input is tolerated: ``None`` and float NaN (what pandas
        yields for an empty CSV cell) become an empty string, anything else is
        converted with ``str``.
        """
        if not isinstance(name, str):
            # `name != name` is only true for NaN
            is_missing = name is None or (isinstance(name, float) and name != name)
            name = '' if is_missing else str(name)

        # Lowercase the name
        name = name.lower()

        # Remove stop words
        kept = [word for word in name.split() if word not in self.stop_words]

        # Stem each remaining word
        stemmed = ' '.join(self.stemmer.stem(word) for word in kept)

        # Lemmatize the tokens of the stemmed string
        return ' '.join(self.lemmatizer.lemmatize(token) for token in word_tokenize(stemmed))

    def predict_category(self, name):
        """Return the index of the highest-scoring class for ``name``."""
        # Preprocess the name
        cleaned = self.preprocess_name(name)

        # Tokenize the name
        name_seq = self.tokenizer.texts_to_sequences([cleaned])

        # Pad to the sequence length the model was built with
        name_seq = pad_sequences(name_seq, maxlen=self.model.input_shape[1])

        # verbose=0 keeps Keras from printing a progress bar for every call,
        # which floods the output when many names are classified in a loop.
        prediction = self.model.predict(name_seq, verbose=0)

        # Return the predicted category
        return np.argmax(prediction, axis=1)[0]


In [None]:
# Build the classifier from the saved model and tokenizer files
name_classifier = NameClassifier('my_h5_model.h5', 'tokenizer.json')

# Run one example name through it and report the predicted class index
name = 'يسوع المسيح'
category = name_classifier.predict_category(name)
print(f'Name: {name}')
print(f'Category: {category}')


In [None]:
# Read the CSV of names to classify (this rebinds `df`)
QUARRY_CSV_PATH = '/content/quarry-72406-untitled-run718464.csv'
df = pd.read_csv(QUARRY_CSV_PATH)


In [None]:
# Create an instance of the helper class
name_classifier = NameClassifier('my_h5_model.h5', 'tokenizer.json')

# Take the first column by position with .iloc: `row[0]` on a row whose index
# holds column labels relies on a deprecated positional fallback.
# Missing cells are skipped and the rest are coerced to str, because the
# classifier lowercases its input and needs a string.
first_column = df.iloc[:, 0].dropna().astype(str)

for name in first_column:
    # Classify a name
    category = name_classifier.predict_category(name)
    print('Name:', name)
    print('Category:', category)