<a href="https://colab.research.google.com/github/Nethminikavindya/Nethminikavindya/blob/main/Dataminers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas scikit-learn matplotlib seaborn



In [2]:
import pandas as pd

# New Section

In [6]:
# Load the dataset
df = pd.read_csv('/content/CorrectDataSheet.csv')

In [7]:
# Display the first few rows
print(df.head())

      video_id                                              title  \
0  i7twOeeg2s8                           coco在求救？ #小丑 #天使 #shorts   
1  dZKQItATiUY  දිරිය දැරිවී | Diriya Darivi- Ratta ft @Dimi3 ...   
2  s4XMtrbE7LA  Don't miss the end 😱 Adi paavi 🤣 #shorts #tren...   
3  dO58auXzcR0  හිටපු පෙම්වතා පට්ට හොරෙක්... අනූ - කනූ මාධ්‍යය...   
4  qpj8XxCEl_A  Creative Justice at the Checkout: Bananas and ...   

            publishedAt                 channelId            channelTitle  \
0  2024-11-24T07:30:20Z  UCovvTRDnB3XraOrB9jiSB3A                    好人小丑   
1  2024-11-30T03:30:20Z  UCJbxRq_IlWyzvB9KK0Mrs8A                   Ratta   
2  2024-11-23T14:40:01Z  UCaXy6RW7Thxx99_p0y3TpWA  ChandruPriya love life   
3  2024-11-30T11:02:10Z  UCYAQZcyFBNCV29y-7jgoYfQ             Hiru Gossip   
4  2024-11-24T14:00:52Z  UCF5Rp2ghzXsX6vwYqa7aepg  Fabiosa Best Lifehacks   

   categoryId trending_date  \
0          22      24.01.12   
1          23      24.01.12   
2          24      24.01.12  

In [8]:
# Check for missing values
print(df.isnull().sum())

video_id               0
title                  0
publishedAt            0
channelId              0
channelTitle           0
categoryId             0
trending_date          0
tags                   0
view_count             0
likes                  0
dislike                0
comment_count          0
duration             225
thumbnail_link         0
comments_disabled      0
ratings_disabled      11
description          487
dtype: int64


In [9]:
print(df.columns)

Index(['video_id', 'title', 'publishedAt', 'channelId', 'channelTitle',
       'categoryId', 'trending_date', 'tags', 'view_count', 'likes', 'dislike',
       'comment_count', 'duration', 'thumbnail_link', 'comments_disabled',
       'ratings_disabled', 'description'],
      dtype='object')


In [16]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [17]:
# Clean the text data
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

In [19]:
# Check for missing values
print(df[['title', 'description', 'tags']].isnull().sum())

# Check data types
print(df[['title', 'description', 'tags']].dtypes)

title            0
description    487
tags             0
dtype: int64
title          object
description    object
tags           object
dtype: object


In [20]:
# Fill missing values with empty strings and convert to string type
df['title'] = df['title'].fillna('').astype(str)
df['description'] = df['description'].fillna('').astype(str)
df['tags'] = df['tags'].fillna('').astype(str)

In [21]:
# Clean the text in each column
df['cleaned_title'] = df['title'].apply(clean_text)
df['cleaned_description'] = df['description'].apply(clean_text)
df['cleaned_tags'] = df['tags'].apply(clean_text)

# Display the DataFrame with the new cleaned columns
print(df[['title', 'cleaned_title', 'description', 'cleaned_description', 'tags', 'cleaned_tags']].head())

                                               title  \
0                           coco在求救？ #小丑 #天使 #shorts   
1  දිරිය දැරිවී | Diriya Darivi- Ratta ft @Dimi3 ...   
2  Don't miss the end 😱 Adi paavi 🤣 #shorts #tren...   
3  හිටපු පෙම්වතා පට්ට හොරෙක්... අනූ - කනූ මාධ්‍යය...   
4  Creative Justice at the Checkout: Bananas and ...   

                                       cleaned_title  \
0                                      coco   shorts   
1        diriya darivi ratta ft dimi ratta new video   
2  dont miss the end  adi paavi  shorts trending ...   
3                                                      
4  creative justice at the checkout bananas and e...   

                                         description  \
0  欢迎来到【好人小丑】频道，这里是好人小丑的角色扮演、二次元美漫的集结地，同时也是我这个自称“...   
1  දුප්පත් අසරණ දිරිය දැරිවියකට උදවු කරන්න ගිහිං ...   
2  Don't miss the end 😱 Adi paavi 🤣 #shorts #tren...   
3  හිටපු පෙම්වතා පට්ට හොරෙක්... අනූ - කනූ මාධ්‍යය...   
4  When a customer decided to peel her bananas

In [23]:
# Combine the cleaned text columns into a single column
df['combined_text'] = df['cleaned_title'] + ' ' + df['cleaned_description'] + ' ' + df['cleaned_tags']

# Display the combined text column
print(df[['combined_text']].head())

                                       combined_text
0                                  coco   shorts    
1     diriya darivi ratta ft dimi ratta new video...
2  dont miss the end  adi paavi  shorts trending ...
3                                                ...
4  creative justice at the checkout bananas and e...


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features
X = vectorizer.fit_transform(df['combined_text']).toarray()

# Display the shape of the feature matrix
print("Shape of X:", X.shape)

Shape of X: (1350, 5000)


In [26]:
from textblob import TextBlob

# Define a function to get sentiment polarity
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

# Apply the function to generate sentiment labels
df['sentiment'] = df['combined_text'].apply(get_sentiment)

# Display the DataFrame with the new sentiment column
print(df[['combined_text', 'sentiment']].head())

                                       combined_text sentiment
0                                  coco   shorts       neutral
1     diriya darivi ratta ft dimi ratta new video...  positive
2  dont miss the end  adi paavi  shorts trending ...  positive
3                                                ...  positive
4  creative justice at the checkout bananas and e...  positive


In [27]:
# Define the target variable
y = df['sentiment']

In [28]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
# Display the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (1080, 5000)
Shape of X_test: (270, 5000)


**Logistic Regression**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)

# Evaluate the model
print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))

Logistic Regression Results:
Accuracy: 0.9444444444444444
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      0.44      0.61        16
     neutral       0.92      0.96      0.94        80
    positive       0.96      0.98      0.97       174

    accuracy                           0.94       270
   macro avg       0.96      0.79      0.84       270
weighted avg       0.95      0.94      0.94       270

Confusion Matrix:
 [[  7   4   5]
 [  0  77   3]
 [  0   3 171]]


**SVM**

In [31]:
from sklearn.svm import SVC

# Initialize and train the model
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm.predict(X_test)

# Evaluate the model
print("SVM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

SVM Results:
Accuracy: 0.9851851851851852
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00        16
     neutral       0.99      0.96      0.97        80
    positive       0.98      0.99      0.99       174

    accuracy                           0.99       270
   macro avg       0.99      0.99      0.99       270
weighted avg       0.99      0.99      0.99       270

Confusion Matrix:
 [[ 16   0   0]
 [  0  77   3]
 [  0   1 173]]


**Naive Bayes**

In [32]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb.predict(X_test)

# Evaluate the model
print("Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

Naive Bayes Results:
Accuracy: 0.9111111111111111
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      0.31      0.48        16
     neutral       0.89      0.94      0.91        80
    positive       0.92      0.95      0.94       174

    accuracy                           0.91       270
   macro avg       0.94      0.73      0.78       270
weighted avg       0.91      0.91      0.90       270

Confusion Matrix:
 [[  5   1  10]
 [  0  75   5]
 [  0   8 166]]


**Random Forest**

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Results:
Accuracy: 0.9851851851851852
Classification Report:
               precision    recall  f1-score   support

    negative       1.00      1.00      1.00        16
     neutral       0.96      0.99      0.98        80
    positive       0.99      0.98      0.99       174

    accuracy                           0.99       270
   macro avg       0.99      0.99      0.99       270
weighted avg       0.99      0.99      0.99       270

Confusion Matrix:
 [[ 16   0   0]
 [  0  79   1]
 [  0   3 171]]


**Compare Model Performance**

In [34]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.9444444444444444
SVM Accuracy: 0.9851851851851852
Naive Bayes Accuracy: 0.9111111111111111
Random Forest Accuracy: 0.9851851851851852
