# Notebook for testing code in ML-reference

## Import Packages

In [48]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups

## Import Data

In [49]:
newsgroup_data = fetch_20newsgroups(categories=["sci.electronics"])


## Chapter: Data Preprocessing

### Text preprocessing

In [50]:
import cleantext

In [51]:
cleaned_text = cleantext.clean(newsgroup_data.data[0])

In [52]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /Users/Moritz-
[nltk_data]     Work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
word_tokenize(cleaned_text) 

['rogerwworldstdcom',
 'roger',
 'william',
 'subject',
 'hc',
 'public',
 'domain',
 'softwar',
 'organ',
 'world',
 'public',
 'access',
 'unix',
 'brooklin',
 'line',
 'doesnt',
 'motorola',
 'amcu',
 'someth',
 'bb',
 'yet']

In [54]:
sent_tokenize(newsgroup_data.data[0])

['From: rogerw@world.std.com (Roger A Williams)\nSubject: Re: 68HC16 public domain software?',
 "Organization: The World Public Access UNIX, Brookline, MA\nLines: 1\n\nDoesn't Motorola AMCU have something on the BBS yet?",
 '(512-891-3733)']

In [55]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [65]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(newsgroup_data.data[0])
print([(token.text, token.label_) for token in doc.ents])

[('rogerw@world.std.com', 'ORG'), ('Roger A Williams', 'PERSON'), ('The World Public Access', 'ORG'), ('Brookline', 'GPE'), ('MA\nLines', 'ORG'), ('1', 'CARDINAL'), ('Motorola', 'ORG'), ('512-891-3733', 'CARDINAL')]


In [77]:
doc = nlp(newsgroup_data.data[0])
print([sent for sent in doc.sents])

[From: rogerw@world.std.com (Roger A Williams)
Subject: Re: 68HC16 public domain software?
Organization:, The World Public Access UNIX, Brookline, MA
Lines: 1

Doesn't Motorola AMCU have something on the BBS yet?, (512-891-3733)
]


### Categorical Data

In [3]:
from sklearn.datasets import fetch_kddcup99

In [5]:
dataset = fetch_kddcup99(as_frame=True)
df = dataset.data

In [10]:
df.flag.value_counts()

b'SF'        378440
b'S0'         87007
b'REJ'        26875
b'RSTR'         903
b'RSTO'         579
b'SH'           107
b'S1'            57
b'S2'            24
b'RSTOS0'        11
b'S3'            10
b'OTH'            8
Name: flag, dtype: int64

In [11]:
import pandas as pd
import numpy as np
counts_ser = pd.value_counts(df["flag"])
categories_to_mask = counts_ser[(counts_ser/counts_ser.sum()).lt(0.05)].index # using 5% cut-off
df['flag'] = np.where(df["flag"].isin(categories_to_mask),'other',df["flag"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['flag'] = np.where(df["flag"].isin(categories_to_mask),'other',df["flag"])


In [12]:
df.flag.value_counts()

b'SF'     378440
b'S0'      87007
b'REJ'     26875
other       1699
Name: flag, dtype: int64