<a href="https://colab.research.google.com/github/Gaukhar-ai/for_my_Thinkful_work/blob/master/Machine_Learning_on_text_Classification_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning: Text Classification Assignment

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split as tts
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Use the CategorizedPlaintextCorpusReader to import the AP_News corpus.

In [3]:
PATH = '/content/drive/My Drive/Data Science/AP_News'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'.*\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader(PATH, DOC_PATTERN, cat_pattern = CAT_PATTERN)

### Create two separate lists - one containing the text from each document and another containing the category of each article in the corpus.

In [6]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

categories = [corpus.categories(fileid)[0]
              for fileid in corpus.fileids()]

### Preprocess the corpus, ensuring to include the following steps.

- Word tokenize the documents.
- Lemmatize, stem, and lowercase all tokens.
- Remove punctuation and stop words.

In [24]:
def preprocess(docs):
  lemmatizer = WordNetLemmatizer()
  stemmer = SnowballStemmer('english')
  preprocessed = []

  for doc in docs:
    tokenized = word_tokenize(doc)
    cleaned = [stemmer.stem(lemmatizer.lemmatize(token.lower()))
      for token in tokenized
      if not token.lower() in stopwords.words('english')
      if token.isalpha()]
    untokenized = ' '.join(cleaned)
    preprocessed.append(untokenized)
  return preprocessed


In [25]:
preprocessed = preprocess(docs)


### Split the data into training and testing sets with the size of the test set being 30% of the records.

In [14]:
from sklearn import preprocessing

In [27]:
from sklearn.model_selection import train_test_split as tts 
X_train, X_test, y_train, y_test = tts(preprocessed, categories, test_size=0.3, random_state=55)

### Construct a pipeline that TF-IDF vectorizes the text and trains a Random Forest classification model.

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [30]:
model = Pipeline([
                  ('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('rf', RandomForestClassifier()),
])

In [32]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Generate predictions on the test set and print a classification report to evaluate how well the model performed.

In [33]:
from sklearn.metrics import classification_report

predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      health       0.47      0.73      0.57        11
    politics       0.68      0.79      0.73        19
      sports       1.00      0.76      0.87        17
        tech       0.86      0.63      0.73        19

    accuracy                           0.73        66
   macro avg       0.75      0.73      0.72        66
weighted avg       0.78      0.73      0.74        66



### Perform 10-fold cross validation and obtain the averge F1 score across all the folds.

In [34]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, preprocessed, categories,
                         cv=10, scoring='f1_macro')
scores.mean()

0.7877990759240759

### Ingest, preprocess, and predict the topic of the article at the following URL.

In [35]:
import requests
from bs4 import BeautifulSoup

In [36]:
url = 'https://www.nytimes.com/2019/11/25/business/uber-london.html'

In [38]:
def get_url_text(url):
  response = requests.get(url)
  content = response.text 

  TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
  soup = BeautifulSoup(content, 'lxml')
  text_list = [tag.get_text() for tag in soup.find_all(TAGS)]
  text = ' '.join(text_list)
  return text_list

text = get_url_text(url)
text

['',
 '',
 '',
 '',
 '',
 '',
 'Advertisement',
 'Supported by',
 'Uber Is Fighting to Survive in London After Losing Its License ',
 'The company is at odds with regulators and drivers of traditional cabs in its most lucrative European market.',
 '',
 '',
 '',
 '',
 '',
 '',
 'By Adam Satariano and Amie Tsang',
 'Nov. 25, 2019',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'LONDON — Uber suffered a major blow on Monday after London transportation authorities made a surprise decision to not extend its taxi operating license because of persistent safety problems, throwing into question whether the company can continue operating in its most lucrative European market.',
 'The decision will not immediately affect Uber’s presence on London streets. The ride-hailing company said it would appeal the decision, setting off what could be a long legal process. Uber can continue to operate throughout that time. ',
 'But the news adds to a difficult year for the company, which staged a disappointing initial

In [None]:
cleaned = preprocess([text])
cleaned


In [50]:
model.predict(cleaned)[0]

'health'