In [2]:
import pandas as pd
pd.options.display.max_colwidth = 500
# pd.set_option('display.max_columns', None)
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

Реализуйте базовую модель логистической регрессии для классификации текстовых сообщений (используемые данные [здесь](https://github.com/obulygin/pyda_homeworks/blob/master/stat_case_study/spam.csv)) по признаку спама. Для этого:

1) Привидите весь текст к нижнему регистру;  
2) Удалите мусорные символы;  
3) Удалите стоп-слова;  
4) Привидите все слова к нормальной форме;  
5) Преобразуйте все сообщения в вектора TF-IDF.    
6) Разделите данные на тестовые и тренировочные в соотношении 30/70, укажите `random_state=42`.     
7) Постройте модель [логистической регрессии](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html), укажите `random_state=42`, оцените ее точность на тестовых данных;  
8) Опишите результаты при помощи [confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html?highlight=confusion_matrix#sklearn.metrics.confusion_matrix);  
9) Постройте датафрейм, который будет содержать все исходные тексты сообщений, классифицированные неправильно (с указанием фактического и предсказанного).

In [3]:
df = pd.read_csv('spam.csv')
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"
5,spam,"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
6,ham,Even my brother is not like to speak with me. They treat me like aids patent.
7,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
8,spam,WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
9,spam,Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030


**Приведем текст к нижнему регистру.**

In [4]:
df['Message'] = df['Message'].str.lower()
df.head(10)

Unnamed: 0,Category,Message
0,ham,"go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive entry question(std txt rate)t&c's apply 08452810075over18's
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives around here though"
5,spam,"freemsg hey there darling it's been 3 week's now and no word back! i'd like some fun you up for it still? tb ok! xxx std chgs to send, £1.50 to rcv"
6,ham,even my brother is not like to speak with me. they treat me like aids patent.
7,ham,as per your request 'melle melle (oru minnaminunginte nurungu vettam)' has been set as your callertune for all callers. press *9 to copy your friends callertune
8,spam,winner!! as a valued network customer you have been selected to receivea £900 prize reward! to claim call 09061701461. claim code kl341. valid 12 hours only.
9,spam,had your mobile 11 months or more? u r entitled to update to the latest colour mobiles with camera for free! call the mobile update co free on 08002986030


**Удалим мусорные символы.**

In [5]:
df['Message'] = df['Message'].apply(lambda text: re.sub('[\W_]+',' ', text))
df.head(10)

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives around here though
5,spam,freemsg hey there darling it s been 3 week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send 1 50 to rcv
6,ham,even my brother is not like to speak with me they treat me like aids patent
7,ham,as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press 9 to copy your friends callertune
8,spam,winner as a valued network customer you have been selected to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hours only
9,spam,had your mobile 11 months or more u r entitled to update to the latest colour mobiles with camera for free call the mobile update co free on 08002986030


**Удалим стоп-слова.**

In [6]:
stopwords_set = set(stopwords.words('english'))
print(stopwords_set)

{'y', "mustn't", 'after', "hasn't", 'this', 'there', 'no', 'most', 'just', "mightn't", 'they', 'does', 'weren', "you'd", 'into', 'where', 'doing', 'too', 'until', 'all', "that'll", 'how', 'he', 'mightn', 'don', 'below', 'which', 'so', 'shouldn', 'under', 're', 'her', 'can', "shouldn't", 'are', 'd', 'some', 'o', 'while', 'at', 'up', 'myself', 'isn', 'each', 'you', 'same', 'it', 'out', "she's", 'such', 'their', 'off', 'wouldn', 'i', 'was', 'that', 'yours', 'because', 's', 'hadn', "you're", 'its', 'more', 'few', 'once', 'other', 'before', 'between', 'against', 'haven', "haven't", 'ourselves', 'above', "shan't", 'have', 'of', "it's", "didn't", 'won', 'has', 'mustn', 'over', 'wasn', 'theirs', 'my', 'him', 'them', 'down', 'itself', 'about', 'on', 'himself', 'those', 'yourself', 'further', 'your', 'shan', 'yourselves', 'an', 'had', 'why', 'again', 'is', 'needn', 'me', "you'll", 'in', 'a', 'being', 'ours', 'his', 'through', 'to', 'hasn', 'been', 'hers', "hadn't", 'here', "wasn't", "should've",

In [7]:
df['Message'] = df['Message'].apply(lambda text: ' '.join([word for word in text.split() if word not in stopwords_set]))
df.head(10)

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though
5,spam,freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv
6,ham,even brother like speak treat like aids patent
7,ham,per request melle melle oru minnaminunginte nurungu vettam set callertune callers press 9 copy friends callertune
8,spam,winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hours
9,spam,mobile 11 months u r entitled update latest colour mobiles camera free call mobile update co free 08002986030


**Привидем все слова к нормальной форме. (произведем лемматизацию).**

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()

In [9]:
df['Message'] = df['Message'].apply(lambda text: ' '.join([wordnet_lemmatizer.lemmatize(word) for word in text.split()]))
df.head(10)

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though
5,spam,freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv
6,ham,even brother like speak treat like aid patent
7,ham,per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune
8,spam,winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour
9,spam,mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030


 **Преобразуем все сообщения в вектора TF-IDF.**

In [10]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Message'])
names = tfidf.get_feature_names()
tfidf_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=names)

In [11]:
tfidf_matrix.head(10)

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Разделите данные на тестовые и тренировочные в соотношении 30/70.**

In [12]:
words = tfidf_matrix
category = df['Category']

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(words, category, test_size=0.30, random_state=42)

**Построим модель логистической регрессии.**

In [14]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

**Оценим качество метрикой accuracy.**

In [15]:
accuracy_score(y_test, model.predict(X_test))

0.958732057416268

**Оценим качество confusion_matrix.**

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
confusion_matrix(y_test, model.predict(X_test))

array([[1445,    3],
       [  66,  158]], dtype=int64)

**1445 сообщений классифицировано правильно как не спам и 66 ошибочно как спам.**    
**3 собщения классифицировано не верно как спам и 158 верно как спам**

In [22]:
accuracy_score(category, model.predict(tfidf_matrix))

0.9632089016511127

**Создадим датафрейм с изначальными данными и предиктом**

In [27]:
predict = pd.DataFrame([category, model.predict(tfidf_matrix)]).T

In [44]:
final_df = pd.merge(df, predict, left_index=True, right_index=True)
del final_df['Category_x']
final_df.columns = ['message','real_category', 'predict_category']
final_df[final_df['real_category'] != final_df['predict_category']].head(10)

Unnamed: 0,message,real_category,predict_category
5,freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv,spam,ham
15,xxxmobilemovieclub use credit click wap link next txt message click http wap xxxmobilemovieclub com n qjkgighjjgcbl,spam,ham
19,england v macedonia dont miss goal team news txt ur national team 87077 eg england 87077 try wale scotland 4txt ú1 20 poboxox36504w45wq 16,spam,ham
54,sm ac sptv new jersey devil detroit red wing play ice hockey correct incorrect end reply end sptv,spam,ham
56,congrats 1 year special cinema pas 2 call 09061209465 c suprman v matrix3 starwars3 etc 4 free bx420 ip4 5we 150pm dont miss,spam,ham
68,hear new divorce barbie come ken stuff,spam,ham
74,u call,ham,spam
95,free ringtone waiting collected simply text password mix 85069 verify get usher britney fml po box 5249 mk17 92h 450ppw 16,spam,ham
135,want 2 get laid tonight want real dogging location sent direct 2 ur mob join uk largest dogging network bt txting gravel 69888 nt ec2a 31p msg 150p,spam,ham
139,rcv msg chat svc free hardcore service text go 69988 u get nothing u must age verify yr network try,spam,ham
