# 1. Import library

In [329]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from utils import process_tweet
import os
from nltk.stem import PorterStemmer
nltk.download('stopwords')
stemmer = PorterStemmer()
from nltk.tokenize import word_tokenize
import os
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to C:\Users\Dell
[nltk_data]     service\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Load data

In [330]:
def read_file(directory_path:str, data_type:str, label:str):    
    data = []

    try:
        # Get a list of all file names in the directory
        file_names = os.listdir(directory_path)

        for idx, file_name in enumerate(file_names, start=1):
            # Check if the file has a '.txt' extension to ensure it's a text file
            if file_name.endswith('.txt'):
                file_path = os.path.join(directory_path, file_name)
                print(f"Reading data from file: {file_name}")

                try:
                    with open(file_path, 'r') as file:
                        file_contents = file.read()
                        data.append({'stt': idx, 'file_name': file_name, 'content': file_contents, 'label':label, 'data_type': data_type})
                except IOError:
                    print(f"Error reading '{file_name}'. Skipping this file.")
    except FileNotFoundError:
        print("Directory not found.")
    except NotADirectoryError:
        print("The given path is not a directory.")

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)

    # Display the DataFrame
    return df


In [331]:
predict_test =  read_file("TestData_nolabel", 'test', 'unknown')

Reading data from file: 0_unknown.txt
Reading data from file: 10_unknown.txt
Reading data from file: 11_unknown.txt
Reading data from file: 12_unknown.txt
Reading data from file: 13_unknown.txt
Reading data from file: 14_unknown.txt
Reading data from file: 15_unknown.txt
Reading data from file: 16_unknown.txt
Reading data from file: 17_unknown.txt
Reading data from file: 18_unknown.txt
Reading data from file: 19_unknown.txt
Reading data from file: 1_unknown.txt
Reading data from file: 20_unknown.txt
Reading data from file: 21_unknown.txt
Reading data from file: 22_unknown.txt
Reading data from file: 23_unknown.txt
Reading data from file: 24_unknown.txt
Reading data from file: 25_unknown.txt
Reading data from file: 26_unknown.txt
Reading data from file: 27_unknown.txt
Reading data from file: 28_unknown.txt
Reading data from file: 29_unknown.txt
Reading data from file: 2_unknown.txt
Reading data from file: 30_unknown.txt
Reading data from file: 31_unknown.txt
Reading data from file: 32_u

In [332]:
data_train_spam = read_file("TrainData/spam", 'train', 'spam')
data_train_notspam = read_file("TrainData/notspam", 'train', 'notspam')


Reading data from file: 0_spam.txt
Reading data from file: 10_spam.txt
Reading data from file: 11_spam.txt
Reading data from file: 12_spam.txt
Reading data from file: 13_spam.txt
Reading data from file: 14_spam.txt
Reading data from file: 15_spam.txt
Reading data from file: 16_spam.txt
Reading data from file: 17_spam.txt
Reading data from file: 1_spam.txt
Reading data from file: 2_spam.txt
Reading data from file: 3_spam.txt
Reading data from file: 4_spam.txt
Reading data from file: 5_spam.txt
Reading data from file: 6_spam.txt
Reading data from file: 7_spam.txt
Reading data from file: 8_spam.txt
Reading data from file: 9_spam.txt
Reading data from file: 0_notspam.txt
Reading data from file: 100_notspam.txt
Reading data from file: 101_notspam.txt
Reading data from file: 102_notspam.txt
Reading data from file: 103_notspam.txt
Reading data from file: 104_notspam.txt
Reading data from file: 105_notspam.txt
Reading data from file: 106_notspam.txt
Reading data from file: 107_notspam.txt
Read

In [333]:
df = data_train_notspam.append(data_train_spam, ignore_index = True)

  df = data_train_notspam.append(data_train_spam, ignore_index = True)


# 3. Preporcessing

In [334]:
import re,string
def remove_hyperlink(word):
    return  re.sub(r'https?://[^\s\n\r]+', '', word)


def to_lower(word):
    result = word.lower()
    return result

def remove_number(word):
    result = re.sub(r'\d+', '', word)
    return result

def remove_punctuation(word): #bỏ dấu câu
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result


def remove_whitespace(word):
    result = word.strip()
    return result

def replace_newline(word):
    return word.replace('\n','')


def remove_extra_whitespace(word):
    return ' '.join(word.split())

def remove_stopwords(word):
    stop_words = set(stopwords.words('english'))
    filtered_word = [w for w in word.split() if not w in stop_words]
    return " ".join(filtered_word)

def remove_hash(word):
    return re.sub(r'#', '', word)

def remove_old_style(word):
    re.sub(r'^RT[\s]+', '', word)


def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def stem_text(text):
    # Tokenize the text into individual words
    words = word_tokenize(text)
    
    # Apply stemming to each word in the text
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a single text
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text
    
def clean_up_pipeline(sentence):
    cleaning_utils = [remove_hyperlink,
                      replace_newline,
                      to_lower,
                      remove_number,
                      remove_punctuation,remove_whitespace,
                      remove_extra_whitespace,
                      remove_emoji, remove_hash               
                      ]
    for o in cleaning_utils:
        sentence = o(sentence)
    return sentence

In [335]:
df['content'].apply(clean_up_pipeline)

0      subject re s np np date sun dec est michael mm...
1      subject job announcementjob announcement depar...
2      subject translators needed women women am post...
3      subject contributions solicited germanic gener...
4      subject celiac oaxaca native literacy projectm...
                             ...                        
206    subject lists software worldwideorder form add...
207    subject zero down internet opportunity down in...
208    subject re free hello are offering fantastic f...
209    subject comes porn site does nt mess around di...
210    subject even steal identity are being investig...
Name: content, Length: 211, dtype: object

In [336]:
df['label'].value_counts()

notspam    193
spam        18
Name: label, dtype: int64

# 4. Train

In [337]:
X = df['content']  # Input features (content column)
y = df['label']    # Output labels (label column)

# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)  # You can adjust max_features as needed
X_tfidf = vectorizer.fit_transform(X)

# Check class distribution before resampling
class_distribution_before = y.value_counts()
print("Class Distribution Before Resampling:")
print(class_distribution_before)

# Instantiate SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Resample the data using SMOTE
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

class_distribution_after = pd.Series(y_resampled).value_counts()
print("\nClass Distribution After Resampling:")
print(class_distribution_after)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)


Class Distribution Before Resampling:
notspam    193
spam        18
Name: label, dtype: int64

Class Distribution After Resampling:
notspam    193
spam       193
Name: label, dtype: int64


In [338]:
print(X_train)

  (0, 1918)	0.03523371985088938
  (0, 9053)	0.03523371985088938
  (0, 39)	0.03523371985088938
  (0, 139)	0.03523371985088938
  (0, 6310)	0.02953323814492658
  (0, 4341)	0.027439957308722472
  (0, 3936)	0.0327112165113115
  (0, 1711)	0.025220990203446432
  (0, 3149)	0.02462804091620951
  (0, 5814)	0.03523371985088938
  (0, 642)	0.03523371985088938
  (0, 8844)	0.026609223967929092
  (0, 644)	0.03523371985088938
  (0, 539)	0.07046743970177877
  (0, 669)	0.065422433022623
  (0, 2774)	0.02953323814492658
  (0, 8835)	0.025876465230253467
  (0, 7701)	0.03523371985088938
  (0, 7943)	0.03523371985088938
  (0, 3878)	0.03523371985088938
  (0, 8694)	0.03092147190940924
  (0, 158)	0.03523371985088938
  (0, 2961)	0.02839896856983135
  (0, 8653)	0.03523371985088938
  (0, 7761)	0.03092147190940924
  :	:
  (269, 9362)	0.07837343891208134
  (269, 1566)	0.07428460662341697
  (269, 1748)	0.03738965622791244
  (269, 7337)	0.01847269726758289
  (269, 3784)	0.06398458703912593
  (269, 957)	0.0634096969768768

In [339]:
X_test.shape

(116, 10000)

In [340]:
text_feat = X_train
text_feat1 = X_test


In [341]:

x_train = X_train
x_test = X_test
# x_train[0]

In [342]:
x_train.shape

(270, 10000)

In [343]:
x_test.shape

(116, 10000)

In [344]:
print(clean_up_pipeline("But come on Hollywood - a Mountie telling the people of Dawson City, Yukon to elect themselves a marshal (yes a marshal!) and to enforce the law themselves, then gunfighters battling it out on the streets for control of the town?"))

but come on hollywood a mountie telling the people of dawson city yukon to elect themselves a marshal yes a marshal and to enforce the law themselves then gunfighters battling it out on the streets for control of the town


In [345]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [346]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_features = x_train
test_features = x_test

In [347]:
from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(solver='liblinear', penalty='l1')

In [348]:
lrc.fit(train_features,y_train)

In [349]:
lrc.score(test_features,y_test)

0.9655172413793104

# 5. Predict

In [350]:
new_sentence = """
Subject: great part-time summer job !

* * * * * * * * * * * * * * * display boxes credit applications need place small owner-operated stores area . here is : 1 . introduce yourself store owner manager . 2 . our 90 % effective script tells little display box save customers hundreds dollars , drawing card business , $ 5 . 00 $ 15 . 00 every app sent . 3 . spot counter , place box , nothing need done , need is name address company send commission checks . compensaation $ 10 every box place . becoming representative earn commission $ 10 each application came store . is course much profitable plan , pay months years small effort . call 1-888 - 703-5390 code 3 24 hours receive details ! ! * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * removed our mailing list , type : b2998 @ hotmail . com ( : ) area ( remove ) subject area e - mail send . * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

"""


In [351]:
new_sentence = clean_up_pipeline(new_sentence)
new_feature = vectorizer.transform([new_sentence]) # Biến đổi câu thành feature
predicted_label = lrc.predict(new_feature) # Dự đoán nhãn của câu
print("Predicted label for the new sentence:", predicted_label)

Predicted label for the new sentence: ['notspam']


In [352]:
predict_test

Unnamed: 0,stt,file_name,content,label,data_type
0,1,0_unknown.txt,Subject: base generated adjuncts\n\ndoes anyon...,unknown,test
1,2,10_unknown.txt,Subject: 4th nottingham international systemic...,unknown,test
2,3,11_unknown.txt,Subject: salk insitute job\n\nresearch positio...,unknown,test
3,4,12_unknown.txt,Subject: speaks languages ?\n\n> > : vicki fro...,unknown,test
4,5,13_unknown.txt,Subject: syntax query\n\nmember tesl - l list ...,unknown,test
...,...,...,...,...,...
73,74,76_unknown.txt,"Subject: credit program "" guaranteed credit ""\...",unknown,test
74,75,77_unknown.txt,Subject: free promotional offer\n\n' ' own 100...,unknown,test
75,76,7_unknown.txt,"Subject: re : 3 . 387 rules , tone grammar\n\n...",unknown,test
76,77,8_unknown.txt,Subject: rules\n\n3 . 387 martti arnold nyman ...,unknown,test


In [353]:
for index, row in predict_test.iterrows():
    content1 = row['content']
    new_sentence1 = clean_up_pipeline(content1)
#     print(new_sentence1)

    new_sentence1 = vectorizer.transform([new_sentence1])
    predicted_label1 = lrc.predict(new_sentence1)
#     print(predicted_label1)
    predict_test.loc[index, 'file_name'] = f"{index}_{predicted_label1[0]}.txt"   
    current_name = f"result_predict/{index}_unknown.txt"

    # specify the new file name and path
    new_name = f"result_predict/{index}_unknown.txt,{predicted_label1[0]}"
    print(new_name)
    # rename the file
    os.rename(current_name, new_name)

result_predict/0_unknown.txt,notspam
result_predict/1_unknown.txt,notspam
result_predict/2_unknown.txt,notspam
result_predict/3_unknown.txt,notspam
result_predict/4_unknown.txt,notspam
result_predict/5_unknown.txt,notspam
result_predict/6_unknown.txt,notspam
result_predict/7_unknown.txt,notspam
result_predict/8_unknown.txt,notspam
result_predict/9_unknown.txt,notspam
result_predict/10_unknown.txt,notspam
result_predict/11_unknown.txt,notspam
result_predict/12_unknown.txt,notspam
result_predict/13_unknown.txt,notspam
result_predict/14_unknown.txt,notspam
result_predict/15_unknown.txt,notspam
result_predict/16_unknown.txt,notspam
result_predict/17_unknown.txt,notspam
result_predict/18_unknown.txt,notspam
result_predict/19_unknown.txt,notspam
result_predict/20_unknown.txt,notspam
result_predict/21_unknown.txt,notspam
result_predict/22_unknown.txt,notspam
result_predict/23_unknown.txt,notspam
result_predict/24_unknown.txt,notspam
result_predict/25_unknown.txt,notspam
result_predict/26_unkn

In [354]:
predict_test

Unnamed: 0,stt,file_name,content,label,data_type
0,1,0_notspam.txt,Subject: base generated adjuncts\n\ndoes anyon...,unknown,test
1,2,1_notspam.txt,Subject: 4th nottingham international systemic...,unknown,test
2,3,2_notspam.txt,Subject: salk insitute job\n\nresearch positio...,unknown,test
3,4,3_notspam.txt,Subject: speaks languages ?\n\n> > : vicki fro...,unknown,test
4,5,4_notspam.txt,Subject: syntax query\n\nmember tesl - l list ...,unknown,test
...,...,...,...,...,...
73,74,73_notspam.txt,"Subject: credit program "" guaranteed credit ""\...",unknown,test
74,75,74_spam.txt,Subject: free promotional offer\n\n' ' own 100...,unknown,test
75,76,75_notspam.txt,"Subject: re : 3 . 387 rules , tone grammar\n\n...",unknown,test
76,77,76_notspam.txt,Subject: rules\n\n3 . 387 martti arnold nyman ...,unknown,test
