## DATA LOADING

In [1]:
import os
import glob
import pandas as pd

def load_data(folder_names, root_path):
    doc_list = []
    tags = folder_names
    
    for folder in folder_names:
        folder_path = os.path.join(root_path, folder)
        file_names = glob.glob(os.path.join(folder_path, "*.txt"))
        
        for file_path in file_names:
            with open(file_path, encoding="latin-1") as f:
                lines = f.readlines()
                heading = lines[0].strip()  # Stripping the text by spaces and using the first element as the heading
                body = ' '.join([l.strip() for l in lines[1:]])
                doc_list.append([folder, heading, body])
        
        print(f"Loading data from \033[1m{folder}\033[0m directory")
    
    print("\nEntire Data is loaded successfully")
    return doc_list

# Define the folder names corresponding to the different categories
folder_names = ['business', 'entertainment', 'politics', 'sport', 'tech']

# Define the root path to the 'News Articles' folder
root_path = r'C:\Users\Administrator\Desktop\NLP\archive\BBC News Summary\News Articles'

# Call the load_data function to load the dataset
dataset = load_data(folder_names, root_path)

# Convert the list of lists into a Pandas DataFrame
df = pd.DataFrame(dataset, columns=['category', 'heading', 'body'])

# Check how many values are in each category
tags_values = df['category'].value_counts()
print(tags_values)


Loading data from [1mbusiness[0m directory
Loading data from [1mentertainment[0m directory
Loading data from [1mpolitics[0m directory
Loading data from [1msport[0m directory
Loading data from [1mtech[0m directory

Entire Data is loaded successfully
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64


In [2]:
df

Unnamed: 0,category,heading,body
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...
2220,tech,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,tech,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,tech,Be careful how you code,A new European directive could put software w...
2223,tech,US cyber security chief resigns,The man making sure US computer networks are ...


### ANALYSING THE DATA 

In [3]:
first_row = df.iloc[0]
print(first_row)

category                                             business
heading                     Ad sales boost Time Warner profit
body         Quarterly profits at US media giant TimeWarne...
Name: 0, dtype: object


In [4]:
df.isnull().sum()

category    0
heading     0
body        0
dtype: int64

In [5]:
df.duplicated().sum()

98

In [6]:

df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().sum()

0

## DATA PREPROCESSING
 

In [8]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Convert to lowercase
df['heading'] = df['heading'].str.lower()
df['body'] = df['body'].str.lower()

# Step 2: Tokenize the strings into individual words
df['heading'] = df['heading'].apply(word_tokenize)
df['body'] = df['body'].apply(word_tokenize)

# Step 3: Remove punctuation and stopwords
stop_words = set(stopwords.words('english'))

def remove_punctuation_and_stopwords(tokens):
    return [token for token in tokens if token.isalpha() and token not in stop_words]

df['heading'] = df['heading'].apply(remove_punctuation_and_stopwords)
df['body'] = df['body'].apply(remove_punctuation_and_stopwords)

# Step 4: Apply stemming (you can also use lemmatization if needed)
stemmer = PorterStemmer()

def apply_stemming(tokens):
    return [stemmer.stem(token) for token in tokens]

df['heading'] = df['heading'].apply(apply_stemming)
df['body'] = df['body'].apply(apply_stemming)

print(df)


[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


      category                                  heading  \
0     business  [ad, sale, boost, time, warner, profit]   
1     business        [dollar, gain, greenspan, speech]   
2     business   [yuko, unit, buyer, face, loan, claim]   
3     business     [high, fuel, price, hit, ba, profit]   
4     business     [pernod, takeov, talk, lift, domecq]   
...        ...                                      ...   
2219      tech      [new, consol, promis, big, problem]   
2220      tech       [bt, program, beat, dialler, scam]   
2222      tech                             [care, code]   
2223      tech        [us, cyber, secur, chief, resign]   
2224      tech                      [lose, onlin, game]   

                                                   body  
0     [quarterli, profit, us, media, giant, timewarn...  
1     [dollar, hit, highest, level, euro, almost, th...  
2     [owner, embattl, russian, oil, giant, yuko, as...  
3     [british, airway, blame, high, fuel, price, dr...  
4

In [9]:
df

Unnamed: 0,category,heading,body
0,business,"[ad, sale, boost, time, warner, profit]","[quarterli, profit, us, media, giant, timewarn..."
1,business,"[dollar, gain, greenspan, speech]","[dollar, hit, highest, level, euro, almost, th..."
2,business,"[yuko, unit, buyer, face, loan, claim]","[owner, embattl, russian, oil, giant, yuko, as..."
3,business,"[high, fuel, price, hit, ba, profit]","[british, airway, blame, high, fuel, price, dr..."
4,business,"[pernod, takeov, talk, lift, domecq]","[share, uk, drink, food, firm, alli, domecq, r..."
...,...,...,...
2219,tech,"[new, consol, promis, big, problem]","[make, game, futur, consol, requir, graphic, a..."
2220,tech,"[bt, program, beat, dialler, scam]","[bt, introduc, two, initi, help, beat, rogu, d..."
2222,tech,"[care, code]","[new, european, direct, could, put, softwar, w..."
2223,tech,"[us, cyber, secur, chief, resign]","[man, make, sure, us, comput, network, safe, s..."


In [10]:
df['heading'][1]

['dollar', 'gain', 'greenspan', 'speech']

#### After preprocessing it is in list format so changing into string format

In [11]:
# Convert lists in 'heading' column back to strings
df['heading'] = df['heading'].apply(lambda words: ' '.join(words))

# Convert lists in 'body' column back to strings
df['body'] = df['body'].apply(lambda words: ' '.join(words))

In [12]:
df

Unnamed: 0,category,heading,body
0,business,ad sale boost time warner profit,quarterli profit us media giant timewarn jump ...
1,business,dollar gain greenspan speech,dollar hit highest level euro almost three mon...
2,business,yuko unit buyer face loan claim,owner embattl russian oil giant yuko ask buyer...
3,business,high fuel price hit ba profit,british airway blame high fuel price drop prof...
4,business,pernod takeov talk lift domecq,share uk drink food firm alli domecq risen spe...
...,...,...,...
2219,tech,new consol promis big problem,make game futur consol requir graphic artist m...
2220,tech,bt program beat dialler scam,bt introduc two initi help beat rogu dialler s...
2222,tech,care code,new european direct could put softwar writer r...
2223,tech,us cyber secur chief resign,man make sure us comput network safe secur res...


### SPLITTING THE DATA

In [13]:
X = df.iloc[:,1:3]
y = df['category']

In [14]:
X

Unnamed: 0,heading,body
0,ad sale boost time warner profit,quarterli profit us media giant timewarn jump ...
1,dollar gain greenspan speech,dollar hit highest level euro almost three mon...
2,yuko unit buyer face loan claim,owner embattl russian oil giant yuko ask buyer...
3,high fuel price hit ba profit,british airway blame high fuel price drop prof...
4,pernod takeov talk lift domecq,share uk drink food firm alli domecq risen spe...
...,...,...
2219,new consol promis big problem,make game futur consol requir graphic artist m...
2220,bt program beat dialler scam,bt introduc two initi help beat rogu dialler s...
2222,care code,new european direct could put softwar writer r...
2223,us cyber secur chief resign,man make sure us comput network safe secur res...


In [15]:
y 

0       business
1       business
2       business
3       business
4       business
          ...   
2219        tech
2220        tech
2222        tech
2223        tech
2224        tech
Name: category, Length: 2127, dtype: object

#### we are converting y column which has business ,tech, etc categories to numbers as machine cants understand text

In [16]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [17]:
y

array([0, 0, 0, ..., 4, 4, 4])

### SPLITING TRAIN DATA AND TEST DATA

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [19]:
X_train.shape

(1701, 2)

### APPLYING VECTORIZATION TECHNIQUE - BOW

In [20]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cv = CountVectorizer()

In [22]:
X_train_bow = cv.fit_transform(X_train['body']).toarray()
X_test_bow = cv.transform(X_test['body']).toarray()

In [23]:
X_train_bow.shape

(1701, 16765)

### CALLING ML ALGORITHM - NAIVE BAYES AND TRAINING THE MODEL 

In [34]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [35]:
gnb = GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

### EVALUATING THE MODEL USING ACCURACY AND CONFUSION MATRIX

In [37]:
gnb.fit(X_train_bow,y_train)
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred,average='macro'))

0.9225352112676056
[[79  2  6  0  6]
 [ 0 68  1  0  4]
 [ 8  0 87  1  1]
 [ 0  1  0 90  0]
 [ 0  1  2  0 69]]
0.9220502820933856


In [39]:
mnb.fit(X_train_bow,y_train)
y_pred = mnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred,average='macro'))

0.9906103286384976
[[91  0  1  0  1]
 [ 0 72  0  0  1]
 [ 0  0 97  0  0]
 [ 0  0  1 90  0]
 [ 0  0  0  0 72]]
0.9905541905541906


In [40]:
bnb.fit(X_train_bow,y_train)
y_pred = bnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred,average='macro'))

0.9530516431924883
[[91  0  1  0  1]
 [ 1 71  0  0  1]
 [ 9  1 87  0  0]
 [ 0  0  0 91  0]
 [ 4  2  0  0 66]]
0.9570701450113214


### CHECKING WITH ANOTHER ML ALGO- RANDOM FOREST AND ITS ACCURACY -

In [30]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.9647887323943662

### CHECKING THE ACCURACY WITH N GRAMS VECTORIZATION METHOD  AND RANDOM FOREST ALGO






In [71]:
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_bow = cv.fit_transform(X_train['body']).toarray()
X_test_bow = cv.transform(X_test['body']).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.9595505617977528

## APPLYING VECTORIZATION TECHNIQUE - TFIDF

In [31]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer()

In [32]:
X_train_tfidf = tfidf.fit_transform(X_train['body']).toarray()
X_test_tfidf = tfidf.transform(X_test['body'])


In [33]:
## applying Guassian naive bayes on tfidf

gnb.fit(X_train_tfidf,y_train)
y_pred = gnb.predict(X_test_tfidf)

from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

### USING TFIDF AND RANDOM FOREST

In [81]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.952808988764045

### USING TFIDF AND NAIVE BAYES

# USING WORD TO VEC

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)