<a href="https://colab.research.google.com/github/Nethminikavindya/Nethminikavindya/blob/main/Singlish_Dummy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas scikit-learn nltk



In [13]:
# Define the custom Singlish sentiment lexicon
custom_lexicon = {
    "harima": 1.0,  # Very positive
    "lassanayi": 1.0,  # Very positive
    "hodai": 1.0,  # Good
    "hari": 1.0,  # Very
    "ape": 0.5,  # Our (positive connotation)
    "naraka": -1.0,  # Very bad
     "waradak": -1.0,  # Very bad
    "kisiwath": -1.0,  # Not at all
    "ne": -0.5,  # Not (negative)
    "mama": 0.0,  # I (neutral)
    "eka": 0.0,  # One (neutral)
    "samanya": 0.0,  # Normal (neutral)
    "hondai": 1.0,  # Good
    "bohoma": 1.0,  # Very
    "aliyata": 0.5,  # Beautiful
    "weda": -0.5,  # Work (neutral/negative depending on context)
    "gahanna": -0.5,  # To take (neutral/negative depending on context)
}

In [2]:
import pandas as pd

In [14]:
# Convert the lexicon to a DataFrame
lexicon_df = pd.DataFrame(list(custom_lexicon.items()), columns=['word', 'sentiment_score'])

# Save the lexicon to a CSV file
lexicon_df.to_csv('singlish_lexicon.csv', index=False)

print("Lexicon saved as 'singlish_lexicon.csv'")

Lexicon saved as 'singlish_lexicon.csv'


In [15]:
# Load the lexicon from the CSV file
lexicon_df = pd.read_csv('singlish_lexicon.csv')
custom_lexicon = dict(zip(lexicon_df['word'], lexicon_df['sentiment_score']))

print("Custom Lexicon:", custom_lexicon)

Custom Lexicon: {'harima': 1.0, 'lassanayi': 1.0, 'hodai': 1.0, 'hari': 1.0, 'ape': 0.5, 'naraka': -1.0, 'waradak': -1.0, 'kisiwath': -1.0, 'ne': -0.5, 'mama': 0.0, 'eka': 0.0, 'samanya': 0.0, 'hondai': 1.0, 'bohoma': 1.0, 'aliyata': 0.5, 'weda': -0.5, 'gahanna': -0.5}


In [3]:
# Load the dataset
df = pd.read_csv('/content/converted_data.csv')

In [4]:
# Display the first few rows
print(df.head())

                                             Sinhala  \
0  c esc ma tete මෙය මගේ ප්‍රධාන අයිතියයි ඔබ ප්‍ර...   
1  නෑ නෑ ඒක මගේ වරදක් අපිට හරි හමන් කැමරොන් කෙනෙක...   
2  හැක් කිරීම සහ කට වහගෙන කෙළ ගසන කොටස නොවේ කරුණා...   
3  හොඳයි, මම හිතුවා අපි උච්චාරණයෙන් පටන් ගනිමු එය...   
4  ඔයා මගෙන් අහනවා ඒක හරිම ලස්සනයි ඔයාගේ නම මොකක්...   

                                             English  \
0  c esc ma tete this is my headright see you re ...   
1  no no it s my fault we didn t have a proper in...   
2  not the hacking and gagging and spitting part ...   
3  well i thought we d start with pronunciation i...   
4  you re asking me out that s so cute what s you...   

                                            Singlish  
0  [Unkown] [Unkown] [Unkown] [Unkown] meya mage ...  
1  ne ne eka mage waradak apita hari haman [Unkow...  
2  hek kireema saha kata wahagena kela gasana kot...  
3  hondayi mama hithuwa api uchcharanayen patan g...  
4  oya magen ahanawa eka harima lassanayi oyage n..

In [6]:
# Focus on the Singlish column
singlish_texts = df['Singlish']
print(singlish_texts.head())

0    [Unkown] [Unkown] [Unkown] [Unkown] meya mage ...
1    ne ne eka mage waradak apita hari haman [Unkow...
2    hek kireema saha kata wahagena kela gasana kot...
3    hondayi mama hithuwa api uchcharanayen patan g...
4    oya magen ahanawa eka harima lassanayi oyage n...
Name: Singlish, dtype: object


In [7]:
import re

In [9]:
def clean_singlish_text(text):
    # Remove special characters and numbers (keep English letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the cleaning function
df['cleaned_singlish'] = df['Singlish'].apply(clean_singlish_text)

In [10]:
# Display the cleaned text
print(df[['Singlish', 'cleaned_singlish']].head())

                                            Singlish  \
0  [Unkown] [Unkown] [Unkown] [Unkown] meya mage ...   
1  ne ne eka mage waradak apita hari haman [Unkow...   
2  hek kireema saha kata wahagena kela gasana kot...   
3  hondayi mama hithuwa api uchcharanayen patan g...   
4  oya magen ahanawa eka harima lassanayi oyage n...   

                                    cleaned_singlish  
0  unkown unkown unkown unkown meya mage pradhana...  
1  ne ne eka mage waradak apita hari haman unkown...  
2  hek kireema saha kata wahagena kela gasana kot...  
3  hondayi mama hithuwa api uchcharanayen patan g...  
4  oya magen ahanawa eka harima lassanayi oyage n...  


In [16]:
from textblob import TextBlob

In [17]:
# Define a function to get sentiment polarity with custom lexicon
def get_sentiment_with_lexicon(text):
    # Initialize sentiment score
    sentiment_score = 0

    # Split the text into words
    words = text.split()

    # Check each word against the custom lexicon
    for word in words:
        if word in custom_lexicon:
            sentiment_score += custom_lexicon[word]

    # Use TextBlob for English words
    blob = TextBlob(text)
    sentiment_score += blob.sentiment.polarity

    # Classify sentiment based on the total score
    if sentiment_score > 0:
        return 'positive'
    elif sentiment_score < 0:
        return 'negative'
    else:
        return 'neutral'

In [18]:
# Apply the function to generate sentiment labels
df['sentiment'] = df['cleaned_singlish'].apply(get_sentiment_with_lexicon)

# Display the DataFrame with the new sentiment column
print(df[['cleaned_singlish', 'sentiment']].head())

                                    cleaned_singlish sentiment
0  unkown unkown unkown unkown meya mage pradhana...   neutral
1  ne ne eka mage waradak apita hari haman unkown...  negative
2  hek kireema saha kata wahagena kela gasana kot...  positive
3  hondayi mama hithuwa api uchcharanayen patan g...  positive
4  oya magen ahanawa eka harima lassanayi oyage n...  positive


In [19]:
from google.colab import files

# Download the CSV file
files.download('singlish_lexicon.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
from google.colab import files

# Upload the lexicon file
uploaded = files.upload()

# Load the uploaded file
for filename in uploaded.keys():
    print(f"Uploaded {filename}")

Saving singlish_lexicon.csv to singlish_lexicon (1).csv
Uploaded singlish_lexicon (1).csv


In [21]:
from textblob import TextBlob

# Define a function to get sentiment polarity with custom lexicon
def get_sentiment_with_lexicon(text):
    # Initialize sentiment score
    sentiment_score = 0

    # Split the text into words
    words = text.split()

    # Check each word against the custom lexicon
    for word in words:
        if word in custom_lexicon:
            sentiment_score += custom_lexicon[word]

    # Use TextBlob for English words
    blob = TextBlob(text)
    sentiment_score += blob.sentiment.polarity

    # Classify sentiment based on the total score
    if sentiment_score > 0:
        return 'positive'
    elif sentiment_score < 0:
        return 'negative'
    else:
        return 'neutral'

# Apply the function to generate sentiment labels
df['sentiment'] = df['cleaned_singlish'].apply(get_sentiment_with_lexicon)

# Display the DataFrame with the new sentiment column
print(df[['cleaned_singlish', 'sentiment']].head())

                                    cleaned_singlish sentiment
0  unkown unkown unkown unkown meya mage pradhana...   neutral
1  ne ne eka mage waradak apita hari haman unkown...  negative
2  hek kireema saha kata wahagena kela gasana kot...  positive
3  hondayi mama hithuwa api uchcharanayen patan g...  positive
4  oya magen ahanawa eka harima lassanayi oyage n...  positive


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = vectorizer.fit_transform(df['cleaned_singlish']).toarray()

# Define the target variable (generated sentiment labels)
y = df['sentiment']

In [23]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9818682912677691
Classification Report:
               precision    recall  f1-score   support

    negative       0.97      0.90      0.93       459
     neutral       0.98      1.00      0.99      6039
    positive       0.98      0.84      0.91       396

    accuracy                           0.98      6894
   macro avg       0.98      0.91      0.94      6894
weighted avg       0.98      0.98      0.98      6894

Confusion Matrix:
 [[ 411   48    0]
 [   8 6025    6]
 [   4   59  333]]


In [25]:
from sklearn.svm import SVC

# Initialize and train the model
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm.predict(X_test)

# Evaluate the model
print("SVM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

SVM Results:
Accuracy: 0.9963736582535538
Classification Report:
               precision    recall  f1-score   support

    negative       0.98      0.99      0.99       459
     neutral       1.00      1.00      1.00      6039
    positive       0.98      0.98      0.98       396

    accuracy                           1.00      6894
   macro avg       0.99      0.99      0.99      6894
weighted avg       1.00      1.00      1.00      6894

Confusion Matrix:
 [[ 455    4    0]
 [   6 6027    6]
 [   2    7  387]]


In [26]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb.predict(X_test)

# Evaluate the model
print("Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

Naive Bayes Results:
Accuracy: 0.9006382361473745
Classification Report:
               precision    recall  f1-score   support

    negative       0.87      0.29      0.43       459
     neutral       0.90      1.00      0.95      6039
    positive       1.00      0.12      0.21       396

    accuracy                           0.90      6894
   macro avg       0.92      0.47      0.53      6894
weighted avg       0.90      0.90      0.87      6894

Confusion Matrix:
 [[ 131  328    0]
 [   8 6031    0]
 [  12  337   47]]


In [27]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print("Random Forest Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Results:
Accuracy: 0.9957934435741225
Classification Report:
               precision    recall  f1-score   support

    negative       0.98      0.99      0.98       459
     neutral       1.00      1.00      1.00      6039
    positive       0.98      0.97      0.98       396

    accuracy                           1.00      6894
   macro avg       0.99      0.99      0.99      6894
weighted avg       1.00      1.00      1.00      6894

Confusion Matrix:
 [[ 453    6    0]
 [   5 6026    8]
 [   3    7  386]]


In [29]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.9818682912677691
SVM Accuracy: 0.9963736582535538
Naive Bayes Accuracy: 0.9006382361473745
Random Forest Accuracy: 0.9957934435741225
