In [5]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import re
from TCSP import read_stopwords_list

# English Translated

In [6]:
# Load the processed (English-translated) frame and the full frame.
# reset_index() moves each pickle's index into an ordinary column.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
data = pd.read_pickle('../data/full_df_en_processed.pkl').reset_index()  # .drop_duplicates('clean_msg')
temp = pd.read_pickle('../data/full_df.pkl').reset_index()

# Attach the original (pre-translation) text by doc_id, then keep a single row
# per distinct original text and per distinct cleaned message.
data = (
    data
    .merge(temp[['doc_id', 'pretranslation']], how='left', on='doc_id')
    .drop_duplicates('pretranslation')
    .drop_duplicates('clean_msg')
)
# data=data[['class','processed']].drop_duplicates().rename(columns={'processed':'clean_msg'})

# Discard rows whose cleaned message is empty and renumber the rows from 0.
data = data.loc[data['clean_msg'] != ''].reset_index(drop=True)

In [7]:
# Pattern matching one character in the range U+4E00..U+9FFF.
cn_word = re.compile("[\u4e00-\u9FFF]")
# Keep only the matching characters of the original text, in their original order.
data['chinese'] = data['pretranslation'].apply(lambda text: ''.join(cn_word.findall(text)))

In [8]:
# Tag a row 'c' when more than two characters were extracted into 'chinese', else 'e'.
data['lang'] = data['chinese'].map(lambda chars: 'e' if len(chars) <= 2 else 'c')

In [9]:
# Non-null count of every column for each language tag.
data.groupby(by='lang').count()

Unnamed: 0_level_0,doc_id,class,translated,clean_msg,pretranslation,chinese
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c,23583,23583,23583,23583,23583,23583
e,15662,15662,15662,15662,15662,15662


In [10]:
# en_regex = re.compile('[^a-zA-Z ]')
# d = enchant.Dict("en_US") 
# STOPWORDS = stopwords.words('english')

In [11]:
# def enPreprocess(string):
#     string=string.replace('\n',' ') #remove newline char
#     string=en_regex.sub('',string) #removes non alphabets
#     string=string.split()
#     string=[word.lower() for word in string if len(word)>1]
#     string=' '.join([word for word in string if d.check(word) and (word not in STOPWORDS)]) # split the string into list and check each word if it is in the english dictionary and longer than 1 alphabet
#     return string

In [12]:
# data['clean_msg']=data['translated'].apply(lambda x: enPreprocess(x))
# data.to_pickle('lowercase words.pkl')

In [13]:

# The label is the 'class' column; the features are every other column.
y = data['class']
X = data.drop('class', axis=1)

# First hold out 20% as the test set, then hold out 20% of the remainder as the
# validation set. Both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=.2, random_state=42
)

In [14]:
# Number of training labels per class.
y_train.value_counts(dropna=True)

class
ham     19566
spam     5550
Name: count, dtype: int64

In [15]:
# Under-sample ham so that both classes are equally represented in training.
# The ham sample size is taken from the data (never more than the ham rows
# available) instead of a hard-coded count, and both random draws are seeded so
# re-running the notebook reproduces the same training set.
spam_train = y_train[y_train == 'spam']
ham_train = y_train[y_train == 'ham']
n_keep = min(len(ham_train), len(spam_train))
y_train = pd.concat(
    [ham_train.sample(n_keep, random_state=42), spam_train]
).sample(frac=1, random_state=42)  # under sampling, then shuffle
# y_train=pd.concat([y_train[y_train=='ham'],y_train[y_train=='spam'].sample(24461,replace=True)]).sample(frac=1) #over sampling

In [16]:
# Keep (and reorder) the feature rows so they line up with the resampled labels.
X_train = X_train.loc[y_train.index, :]

In [17]:
# Rows tagged 'e': cleaned message text per split, with labels picked by the same index.
X_train_e = X_train.loc[X_train['lang'] == 'e', 'clean_msg']
X_val_e = X_val.loc[X_val['lang'] == 'e', 'clean_msg']
X_test_e = X_test.loc[X_test['lang'] == 'e', 'clean_msg']
y_train_e = y_train.loc[X_train_e.index]
y_val_e = y_val.loc[X_val_e.index]
y_test_e = y_test.loc[X_test_e.index]

# Rows tagged 'c': extracted Chinese characters per split, with matching labels.
X_train_c = X_train.loc[X_train['lang'] == 'c', 'chinese']
X_val_c = X_val.loc[X_val['lang'] == 'c', 'chinese']
X_test_c = X_test.loc[X_test['lang'] == 'c', 'chinese']
y_train_c = y_train.loc[X_train_c.index]
y_val_c = y_val.loc[X_val_c.index]
y_test_c = y_test.loc[X_test_c.index]

# From here on the un-suffixed splits hold only the cleaned message text
# (all rows, regardless of language tag).
X_train = X_train['clean_msg']
X_val = X_val['clean_msg']
X_test = X_test['clean_msg']

In [18]:
# Bag-of-words counts with scikit-learn's built-in English stop-word list.
# The vocabulary is learned from the training messages only and then applied
# unchanged to the validation and test messages.
vect = CountVectorizer(stop_words='english').fit(X_train)
X_train_dtm = vect.transform(X_train)
X_val_dtm = vect.transform(X_val)
X_test_dtm = vect.transform(X_test)



In [19]:
# TF-IDF re-weighting transformer. It is only constructed here; the lines that
# would apply it are left commented out.
# smooth_idf is a boolean option, so pass True rather than the integer 1.
tfidvect = TfidfTransformer(smooth_idf=True)
# To enable TF-IDF, transform the count matrices (not the raw text):
# X_train_dtm = tfidvect.fit_transform(X_train_dtm)
# X_val_dtm = tfidvect.transform(X_val_dtm)
# X_test_dtm = tfidvect.transform(X_test_dtm)

In [20]:
# Multinomial Naive Bayes on the count matrix, scored on the validation split.
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_val_dtm)

print('translated dataset Multinomial Naive Bayes ')
print('acc', metrics.accuracy_score(y_val, y_pred_class))
# F1 is reported for the 'spam' class.
print('f1', f1_score(y_val.to_list(), y_pred_class, pos_label="spam"))
# metrics.confusion_matrix(y_val, y_pred_class)

translated dataset Multinomial Naive Bayes 
acc 0.9305732484076433
f1 0.8425992779783393


In [21]:
# Logistic regression (liblinear solver) on the count matrix, scored on the validation split.
logreg = LogisticRegression(solver='liblinear').fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_val_dtm)
# Column 1 of predict_proba belongs to logreg.classes_[1].
# NOTE(review): presumably 'spam', since classes_ is sorted - confirm.
y_pred_prob = logreg.predict_proba(X_val_dtm)[:, 1]

print('translated dataset log regression ')
print('acc', metrics.accuracy_score(y_val, y_pred_class))
print('validation f1', f1_score(y_val.to_list(), y_pred_class, pos_label="spam"))

translated dataset log regression 
acc 0.9487261146496815
validation f1 0.8924515698062792


# Threshold optimisation

In [22]:
# Sweep candidate decision thresholds and record the validation F1 ('spam' as
# the positive class) obtained when a message is labelled spam only if its
# predicted probability exceeds the threshold.
# a: thresholds tried, b: F1 score for each threshold (same order).
a = list(np.arange(0.7, .85, 0.001))
b = [
    f1_score(
        y_val.to_list(),
        np.where(y_pred_prob > cutoff, 'spam', 'ham'),
        pos_label="spam",
    )
    for cutoff in a
]


In [23]:
# Ten thresholds with the highest validation F1, best first.
thresh_hold_table = (
    pd.DataFrame({'threshhold value': a, 'f1_score': b})
    .sort_values('f1_score', ascending=False)
    .head(10)
)
thresh_hold_table

Unnamed: 0,threshhold value,f1_score
44,0.744,0.916667
45,0.745,0.916667
46,0.746,0.916667
47,0.747,0.916667
48,0.748,0.916667
49,0.749,0.916667
87,0.787,0.916577
43,0.743,0.916343
86,0.786,0.916309
85,0.785,0.916309


In [24]:
# Apply the best validation threshold (first row of the sorted table) to the test split.
# The test probabilities get their own name so the validation probabilities in
# y_pred_prob are not overwritten; re-running the threshold sweep afterwards
# therefore still compares like with like.
best_threshold = thresh_hold_table['threshhold value'].iloc[0]
y_test_prob = logreg.predict_proba(X_test_dtm)[:, 1]
opt_predicted = pd.Series(np.where(y_test_prob > best_threshold, 'spam', 'ham'))
print('test f1', f1_score(y_test.to_list(), opt_predicted, pos_label="spam"))

test f1 0.9240398293029872


# Weights Interpretation

In [25]:
# Random subset of 8666 ham rows plus every spam row.
# The draw is seeded so the word counts computed from this sample are reproducible.
sample = pd.concat([
    data[data['class'] == 'ham'].sample(8666, random_state=42),
    data[data['class'] == 'spam'],
])

In [26]:
# Pair every logistic-regression coefficient with its vocabulary word.
# vect.vocabulary_ maps word -> column index; after reset_index() the 'index'
# column of the coefficient frame holds that same column position.
vocab = pd.DataFrame({
    'index': list(vect.vocabulary_.values()),
    'word': list(vect.vocabulary_.keys()),
})
temp = pd.DataFrame({'weights': logreg.coef_[0]}).reset_index().merge(vocab, how='left', on='index')

# For each strongly weighted word (|weight| > 0.9), count how many ham and spam
# rows of `sample` contain it.
# NOTE(review): this is a plain substring test against the pre-translation text
# (so 'list' also matches 'listen') - confirm this is the intended match.
ham_count = []
spam_count = []
word_list = []
strong_words = temp.loc[temp['weights'].abs() > .9, 'word']
for word in strong_words:
    contains_word = sample['pretranslation'].str.contains(word, regex=False)
    counter = sample.loc[contains_word, 'class'].value_counts()
    # .get returns 0 when a class is absent, replacing the former bare `except:`.
    ham_count.append(counter.get('ham', 0))
    spam_count.append(counter.get('spam', 0))
    word_list.append(word)

word_ham_spam_counter = pd.DataFrame({'word': word_list, 'ham_count': ham_count, 'spam_count': spam_count})

# Words below the weight cut-off have no counts (NaN) and are dropped by the filter.
temp = temp.merge(word_ham_spam_counter, how='left', on='word')
temp[temp['ham_count'] > 5].sort_values('weights', ascending=True)


Unnamed: 0,index,weights,word,ham_count,spam_count
25599,25599,-2.424083,thanks,263.0,23.0
25871,25871,-2.258781,title,69.0,69.0
14905,14905,-1.57345,list,781.0,213.0
18814,18814,-1.417257,plan,295.0,154.0
2788,2788,-1.40956,board,954.0,105.0
21790,21790,-1.376202,robot,294.0,5.0
1160,1160,-1.361579,anybody,152.0,23.0
21658,21658,-1.346509,ribbon,11.0,5.0
26876,26876,-1.340602,university,47.0,13.0
28263,28263,-1.327338,wrote,989.0,12.0


# 1 model for each language

In [27]:
# Count vectorizer for the 'e' rows only: the vocabulary is learned from the
# English training messages and then applied to the English test messages.
vect = CountVectorizer(stop_words='english').fit(X_train_e)
X_train_e_dtm = vect.transform(X_train_e)
X_test_e_dtm = vect.transform(X_test_e)

In [28]:

# Keep only the characters matched by cn_word in the Chinese train and test text.
X_train_c = X_train_c.apply(lambda text: ''.join(cn_word.findall(text)))
X_test_c = X_test_c.apply(lambda text: ''.join(cn_word.findall(text)))

In [29]:
# Character-level count vectorizer for the 'c' rows: `list` splits a string
# into its individual characters, so every character is one token.
# Stop words come from TCSP.read_stopwords_list().
vect = CountVectorizer(tokenizer=list, stop_words=read_stopwords_list())
vect.fit(X_train_c)
X_train_c_dtm = vect.transform(X_train_c)
X_test_c_dtm = vect.transform(X_test_c)



In [30]:

# One Multinomial Naive Bayes model per language, each predicting its own test rows.
# English model:
nb = MultinomialNB().fit(X_train_e_dtm, y_train_e)
y_pred_e_class = nb.predict(X_test_e_dtm)

# Chinese model (nb is rebound, so only this model remains in `nb` afterwards):
nb = MultinomialNB().fit(X_train_c_dtm, y_train_c)
y_pred_c_class = nb.predict(X_test_c_dtm)

In [31]:
# Score the two per-language Naive Bayes models as one system on the test split.
# Predictions and true labels are both concatenated English first, then
# Chinese, so the two lists line up element by element.
print('2 model combined Multinomial Naive Bayes ')
y_pred_combined = y_pred_e_class.tolist() + y_pred_c_class.tolist()
y_actual_combined = pd.concat([y_test_e, y_test_c]).to_list()
print('acc', metrics.accuracy_score(y_actual_combined, y_pred_combined))
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
print('f1', f1_score(y_actual_combined, y_pred_combined, pos_label="spam"))

2 model combined Multinomial Naive Bayes 
acc 0.9445789272518792
f1 0.8807238826432684


In [32]:

# One logistic regression (liblinear) per language, each predicting its own test rows.
# English model:
logreg = LogisticRegression(solver='liblinear').fit(X_train_e_dtm, y_train_e)
y_pred_e_class = logreg.predict(X_test_e_dtm)
# y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]

# Chinese model (logreg is rebound, so only this model remains in `logreg` afterwards):
logreg = LogisticRegression(solver='liblinear').fit(X_train_c_dtm, y_train_c)
y_pred_c_class = logreg.predict(X_test_c_dtm)
# y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]

In [33]:

# Score the two per-language logistic regressions as one system on the test split.
# Predictions and true labels are both concatenated English first, then
# Chinese, so the two lists line up element by element.
print('2 model combined log regression ')
y_pred_combined = y_pred_e_class.tolist() + y_pred_c_class.tolist()
y_actual_combined = pd.concat([y_test_e, y_test_c]).to_list()
print('acc', metrics.accuracy_score(y_actual_combined, y_pred_combined))
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
print('f1', f1_score(y_actual_combined, y_pred_combined, pos_label="spam"))

2 model combined log regression 
acc 0.9649636896419926
f1 0.924346629986245


In [34]:
# Pair every coefficient of the current logreg with the token of the current vect.
# vect.vocabulary_ maps token -> column index; after reset_index() the 'index'
# column of the coefficient frame holds that same column position.
vocab = pd.DataFrame({
    'index': list(vect.vocabulary_.values()),
    'word': list(vect.vocabulary_.keys()),
})
temp = pd.DataFrame({'weights': logreg.coef_[0]}).reset_index().merge(vocab, how='left', on='index')

# For each strongly weighted token (|weight| > 0.8), count how many ham and
# spam rows of `sample` contain it (plain substring test on the pre-translation text).
ham_count = []
spam_count = []
word_list = []
strong_words = temp.loc[temp['weights'].abs() > .8, 'word']
for word in strong_words:
    contains_word = sample['pretranslation'].str.contains(word, regex=False)
    counter = sample.loc[contains_word, 'class'].value_counts()
    # .get returns 0 when a class is absent, replacing the former bare `except:`.
    ham_count.append(counter.get('ham', 0))
    spam_count.append(counter.get('spam', 0))
    word_list.append(word)

word_ham_spam_counter = pd.DataFrame({'word': word_list, 'ham_count': ham_count, 'spam_count': spam_count})

# Tokens below the weight cut-off have no counts (NaN) and are dropped by the filter.
temp = temp.merge(word_ham_spam_counter, how='left', on='word')
temp[temp['ham_count'] > 5].sort_values('weights', ascending=True)

Unnamed: 0,index,weights,word,ham_count,spam_count
4455,4455,-1.076214,题,1809.0,963.0
3983,3983,-1.037339,较,1001.0,855.0
466,466,-0.932342,历,683.0,514.0
833,833,-0.818275,太,1425.0,424.0
4431,4431,0.869561,页,61.0,557.0
261,261,0.947042,免,239.0,1540.0
3142,3142,0.980435,网,598.0,3106.0
728,728,0.98506,图,244.0,644.0
3764,3764,1.031492,详,100.0,1289.0
4107,4107,1.128372,邮,140.0,2291.0


In [35]:
# Rows of the sample whose original text contains the character '寻'.
sample.loc[sample['pretranslation'].str.contains('寻', regex=False)]

Unnamed: 0,doc_id,class,translated,clean_msg,pretranslation,chinese,lang
7608,trec06c/data/067/247,ham,[The following text is reprinted from the Girl...,following text reprinted girl discussion forum...,【 以下文字转载自 Girl 讨论区 】 发信人: psycho (风子【Nash/Selt...,以下文字转载自讨论区发信人风子信区标题赠美眉北大硕士白领美女真诚征婚发信人风子信区标题赠美眉...,c
12327,trec06c/data/114/056,ham,It seems that you really have no experience. I...,seems really experience sure take time review ...,看来你的确没阅历没经验。 看人不准可以多审查些时间，可以请长辈或者朋友帮你看看。 朋友介绍的...,看来你的确没阅历没经验看人不准可以多审查些时间可以请长辈或者朋友帮你看看朋友介绍的一个男生认...,c
16295,trec06c/data/148/261,ham,It was just after midnight when I came out of ...,midnight came cash box maybe last dance music ...,从钱柜出来，刚过零点。也许是最后那段舞曲太过疯狂，所以每个人的脸上都还残留着余兴未尽的暧昧。...,从钱柜出来刚过零点也许是最后那段舞曲太过疯狂所以每个人的脸上都还残留着余兴未尽的暧昧寒暄道别...,c
9586,trec06c/data/087/260,ham,The probability of happiness may be lower. But...,probability happiness may lower learn see peop...,幸福的概率可能会低一点 但是如果从此学会看人和独立，也许幸福的概率会高一些 一切都在于自己 ...,幸福的概率可能会低一点但是如果从此学会看人和独立也许幸福的概率会高一些一切都在于自己我是因为...,c
9659,trec06c/data/088/239,ham,It's obvious that you are the one with the pro...,obvious one problem want someone else want fin...,明明就是你有问题，你不要的人家，还不乐意人家自己寻找快乐么？ 看不出凭哪点你mm就该终生不嫁...,明明就是你有问题你不要的人家还不乐意人家自己寻找快乐么看不出凭哪点你就该终生不嫁一世苦等你回...,c
...,...,...,...,...,...,...,...
23155,trec06c/data/212/237,spam,This is a letter in HTML format! -------------...,letter format mail system free download lifeti...,这是一封HTML格式信件！ -------------VolleyMail邮件系统-----...,这是一封格式信件邮件系统免费下载终身可用您好感谢您能在百忙之中抽出时间阅读此信函首先对冒昧地...,c
23209,trec06c/data/213/200,spam,"Socorro, the friend of Socorro and meditates w...",friend meditates crank case toothache related ...,"Socorro, the friend of Socorro and meditates w...",模具估计大师寻求和作模具估价系统主要是根据塑料成品尺寸估算出塑料模具规格以及价格可进行模具估...,c
23253,trec06c/data/214/020,spam,Countless businessmen have benefited from it. ...,countless businessmen benefited tool many ente...,无数商人从中获益 众多企业必备利刃 您有新产品却不知如何推广？您建了网站却没几个人访问？...,无数商人从中获益众多企业必备利刃您有新产品却不知如何推广您建了网站却没几个人访问您做了搜索引...,c
32853,trec06p/data/070/200,spam,How to obtain high-quality overseas customers ...,obtain overseas customers orders obtain overse...,如何获取海外优质客户与订单 如何获取海外优质客户与订单 及国际商务谈判实战技巧强化训练 ...,如何获取海外优质客户与订单如何获取海外优质客户与订单及国际商务谈判实战技巧强化训练上海深圳准...,c


In [36]:
# Extracted Chinese text of the row labelled 32853.
data.at[32853, 'chinese']

'如何获取海外优质客户与订单如何获取海外优质客户与订单及国际商务谈判实战技巧强化训练上海深圳准时开课主办单位华鹰企业管理咨询公司时间地点年月日上海兆安酒店时间地点年月日深圳金融培训中心费用元人包括培训费资料费两天午餐证书费以及上下午茶点等学员对象成长型出口企业的总经理海外营销部长国际贸易部经理区域市场经理主管海外业务员驻外代表以及预备外销员和其他对国际贸易感兴趣的人士课程背景中国很多非常有竞争力的企业因为不懂得如何开拓国际市场而失去了迅速做大做强的机会部分已经出口的企业也因为不懂得如何开拓国际市场而不得不依靠外贸企业间接出口但结果是产品出口了自己并没有享受到高额的利润并游离在国际市场的门外没有自己的海外客户始终受制于外贸企业有的企业已经直接出口产品但却没有找到最有利润的市场和客户仅仅学会了出口操作而没有达到出口的真正目的获取高额销售利润与此形成鲜明对比的是全球买家越来越青睐中国制造的产品纷纷开始从中国采购或者加大从中国采购的力度实践证明主动找上门来的买家比自己主动找去的买家成交率高倍以上平均首次成交所需时间只有后者的但中国的企业却不知道如何抓住这些机会如何能够让自己轻松地被海外客户找到因而坐失商机出口营销实战系列培训课程着重帮助解决中国出口商开拓国际市场的两个核心问题快速获取国际市场与买家信息和高效出口推广策略不仅准确地定位买家而且更能让买家轻松找到和优先联络自己您的七项收益准确定位目标市场和发现高利润市场迅速地找到您全球的潜在买家和合作伙伴发现竞争对手难以发现的客户独享高利润出口订单极大丰富客户数据库不断优化客户结构提高整体客户质量提高国际市场调查技能轻松获取高价值的市场信息掌握一套获取市场信息和客户情报的系统方法知己知彼百战不殆结识同行拓展人脉积累资源培训核心内容一外销启动前的准备人才方面的准备中国各类企业国际营销部门组织架构的设定及管理方式优劣对比硬件方面的准备软件方面的准备资料方面的准备其它方面的准备二掌握产品知识应该包含哪些关键内容产品知识测试题清楚自己产品的名称清楚自己产品的技术知识及卖点清楚自己产品成本构成及报价清楚相关联产品与行业的知识及名称三如何迅速了解行业宏观环境及掌握竞争对手信息行业国际市场宏观环境所包含的要素及获取办法购买现成的行业国际市场宏观环境报告的途径利用互联网手段查询制作简易行业国际市场宏观环境利用互联网查询和分析国内最主要的竞争对手

# Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Random Forest with default hyper-parameters. The seed is fixed because the
# forest is built from random bootstraps and feature subsets; without it the
# scores below change on every run.
rf = RandomForestClassifier(random_state=42)

# Train on the count matrix of the training messages.
rf.fit(X_train_dtm, y_train)

# Predict classes on the validation set.
y_pred_class = rf.predict(X_val_dtm)

# Report validation accuracy and F1 for the 'spam' class.
print('translated dataset Random Forest')
print('acc', accuracy_score(y_val, y_pred_class))
print('f1', f1_score(y_val.to_list(), y_pred_class, pos_label="spam"))

translated dataset Random Forest
acc 0.9592356687898089
f1 0.9118457300275482


# XGBoost

In [39]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score

# Encode the string labels as integers for XGBoost: ham -> 0, spam -> 1.
label_map = {'ham': 0, 'spam': 1}
y_train_mapped, y_val_mapped = (
    labels.map(label_map) for labels in (y_train, y_val)
)

# Default-parameter XGBoost classifier trained on the count matrix.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_dtm, y_train_mapped)

# Validation predictions come back as the integer labels.
y_pred_class = xgb_classifier.predict(X_val_dtm)

# Report validation accuracy and F1 (f1_score's default positive label is 1, i.e. spam).
print('translated dataset XGBoost')
print('acc', accuracy_score(y_val_mapped, y_pred_class))
print('f1', f1_score(y_val_mapped.to_list(), y_pred_class))

translated dataset XGBoost
acc 0.9512738853503184
f1 0.8982035928143713
