Import required libraries

In [1]:
import nltk 
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import  stopwords
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer


Download the Natural Language Toolkit (NLTK) resources needed for tokenization, stop-word removal and lemmatization

In [None]:
# Fetch the NLTK resources used later in the notebook by word_tokenize,
# the stop-word corpus and WordNetLemmatizer.
nltk.download([
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
])

Load the pre-split train and test sets

In [3]:
# Read the processed train and test CSV files directly from the project's
# GitHub repository.
train_df = pd.read_csv(
    'https://raw.githubusercontent.com/Jana-Liebenberg/2401PTDS_Classification_Project/main/Data/processed/train.csv',
    sep=',',
    encoding='utf-8',
)
test_df = pd.read_csv(
    'https://raw.githubusercontent.com/Jana-Liebenberg/2401PTDS_Classification_Project/main/Data/processed/test.csv',
    sep=',',
    encoding='utf-8',
)

Preview the first row of the training data

In [None]:
# Show only the first row: enough to see the column names and a sample value.
train_df.head(n=1)

Plot how the training rows are distributed across the news categories

In [None]:
# Bar chart of the number of training rows in each category.
# An explicit Axes is used so the figure can carry a title and axis labels
# and still be understood when the notebook is skimmed.
fig, ax = plt.subplots()
train_df['category'].value_counts().plot(kind='bar', ax=ax)
ax.set(
    title='Training rows per category',
    xlabel='category',
    ylabel='count',
)
plt.show()

### 1) Format data
in the following order:
1) lowercase the text and remove punctuation and digits
2) tokenise text
3) remove stop words
4) lemmatize words

In [6]:
# English stop words from the NLTK stopwords corpus; remove_stop_words()
# filters tokens against this module-level value.
stopwords_list = stopwords.words('english')

In [7]:
def remove_punctuation_and_numbers(text):
    """Lowercase ``text`` and delete punctuation, digits and other symbols.

    Removed characters are deleted rather than replaced by a space, so
    ``"well-known"`` becomes ``"wellknown"``. Whitespace is left untouched.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN are treated as missing values.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is a missing value.

    Raises
    ------
    AttributeError
        If ``text`` is any other non-string object (for example a list of
        tokens), so that accidentally re-processing already-tokenized data
        fails loudly instead of being silently blanked.
    """
    # A missing value has no .lower(); map it to empty text instead of
    # crashing. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Standardise case first.
    text = text.lower()

    # First pass: drop ASCII punctuation (this includes "_", which the regex
    # below would keep because "_" is a word character) and every character
    # for which str.isdigit() is true.
    text = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )

    # Second pass: drop anything that is neither a word character nor
    # whitespace, i.e. symbols outside string.punctuation such as
    # typographic quotes and dashes. Digits were already removed above.
    return re.sub(r'[^\w\s]', '', text)

In [8]:
def tokenize_text(text):
    """Split ``text`` into a sequence of word tokens using ``word_tokenize``."""
    return word_tokenize(text)


In [9]:
def remove_stop_words(tokens, stop_words=None, min_length=3):
    """Drop stop words and very short tokens from ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; their order is preserved.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the module-level ``stopwords_list``.
    min_length : int, optional
        Minimum number of characters a token needs to be kept (default 3).

    Returns
    -------
    list of str
        Tokens that are at least ``min_length`` long and not in ``stop_words``.
    """
    if stop_words is None:
        stop_words = stopwords_list

    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    excluded = frozenset(stop_words)

    # The cheap length test runs first so short tokens skip the lookup.
    return [
        word for word in tokens
        if len(word) >= min_length and word not in excluded
    ]

In [10]:
def lemmentize_words(tokens):
    """Lemmatize every token as a noun with ``WordNetLemmatizer``.

    Returns a new list with one lemmatized entry per input token, in the
    same order.
    """
    to_lemma = WordNetLemmatizer().lemmatize

    lemmas = []
    for token in tokens:
        # pos='n' asks the lemmatizer to treat each token as a noun.
        lemmas.append(to_lemma(token, pos='n'))
    return lemmas


In [11]:
def format_data(df):
    """Run the full text-cleaning pipeline on one piece of text.

    Despite its name, ``df`` is a single text value (one cell of a column),
    not a DataFrame: it is passed straight to
    ``remove_punctuation_and_numbers``.

    The stages run in this order:
    1. ``remove_punctuation_and_numbers``
    2. ``tokenize_text``
    3. ``remove_stop_words``
    4. ``lemmentize_words``

    Returns whatever ``lemmentize_words`` returns for the filtered tokens.
    """
    cleaned_text = remove_punctuation_and_numbers(df)
    all_tokens = tokenize_text(cleaned_text)
    kept_tokens = remove_stop_words(all_tokens)
    return lemmentize_words(kept_tokens)

In [12]:
# Replace each raw text column with its cleaned token list.
# NOTE: this overwrites the columns in place, so the cell is not idempotent;
# reload train_df before running it a second time.
for text_column in ('headlines', 'description', 'content'):
    train_df[text_column] = train_df[text_column].apply(format_data)

### 1.1) Check the quality and consistency of the data:

### 1.2) Feature Engineering:

In [None]:
# Identify relevant features: print the category and the three processed
# text columns for a few sample rows (positions 0, 2 and 4).
for row_position in (0, 2, 4):
    sample_row = train_df.iloc[row_position]
    for column_name in ('category', 'headlines', 'description', 'content'):
        print(sample_row[column_name])

In [14]:
# One-hot encode the target variable: one integer (0/1) indicator column
# per distinct value of `category`.
y = pd.get_dummies(data=train_df['category'], dtype=int)