In [116]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [117]:
def read_file_text_to_str(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as file: 
        text = file.read()
    cleaned_text = "".join([char for char in text if char.isalpha() or char == " "])
    cf_removed_text = " ".join([word for word in cleaned_text.split() if word != "cf"])
    return cf_removed_text

In [118]:
orwell_text = read_file_text_to_str(file_path = "george_orwell_text.rtf")
shakespeare_text = read_file_text_to_str(file_path="shakespeare_text.rtf")
dylan_text = read_file_text_to_str(file_path="bob_dylan_lyrics.rtf")

# term frequency - inverse document frequency

In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [246]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=30)

In [247]:
text_list = [orwell_text, shakespeare_text, dylan_text]

In [248]:
text_features = tfidf_vectorizer.fit_transform(text_list).toarray()

In [249]:
text_features.shape

(3, 30)

In [250]:
text_features_table = pd.DataFrame(text_features, columns=tfidf_vectorizer.get_feature_names())

In [251]:
text_features_table.T.sort_values(0, ascending=False).head(10)

Unnamed: 0,0,1,2
winston,0.467979,0.0,0.0
party,0.313142,0.0,0.0
animals,0.294654,0.0,0.0
said,0.286633,0.005407,0.260591
planet,0.226479,0.0,0.0
ebookcom,0.225323,0.0,0.0
ebooks,0.225323,0.0,0.0
obrien,0.199902,0.0,0.0
did,0.193819,0.100036,0.156354
time,0.180852,0.059481,0.202682


In [252]:
text_features_table.T.sort_values(1, ascending=False).head(10)

Unnamed: 0,0,1,2
macbethfb,0.0,0.970475,0.0
come,0.070293,0.11085,0.150563
did,0.193819,0.100036,0.156354
make,0.081895,0.091925,0.115818
like,0.148776,0.089221,0.45169
say,0.081895,0.078406,0.225845
know,0.098957,0.059481,0.463272
time,0.180852,0.059481,0.202682
man,0.112606,0.045962,0.301127
way,0.108511,0.037851,0.081073


In [253]:
text_features_table.T.sort_values(2, ascending=False).head(10)

Unnamed: 0,0,1,2
know,0.098957,0.059481,0.463272
just,0.084625,0.002704,0.463272
like,0.148776,0.089221,0.45169
man,0.112606,0.045962,0.301127
said,0.286633,0.005407,0.260591
say,0.081895,0.078406,0.225845
time,0.180852,0.059481,0.202682
did,0.193819,0.100036,0.156354
come,0.070293,0.11085,0.150563
eyes,0.094862,0.016222,0.1274


In [254]:
model = LogisticRegression()

In [255]:
model.fit(text_features, [0,1,2])

LogisticRegression()

In [256]:
new_text = read_file_text_to_str(file_path="beatles_text.rtf")

In [257]:
new_text_features = tfidf_vectorizer.transform([new_text]).toarray()

In [258]:
model.predict(new_text_features)

array([2])

In [259]:
model.predict_proba(new_text_features)

array([[0.33728032, 0.29421454, 0.36850514]])