### 1. Import Functions and Data

In [77]:

import nltk # Python library for NLP
from nltk.corpus import twitter_samples # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt # library for visualization
import random # pseudo-random number generator

import re # library for regular expression operations
import string # for string operations

from nltk.corpus import stopwords # module for stop words that come with NLTK
from nltk.stem import PorterStemmer # module for stemming
from nltk.tokenize import TweetTokenizer # module for tokenizing strings

import csv
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:

# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Save the tweets to a file
with open('positive_tweets.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_positive_tweets))

with open('negative_tweets.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_negative_tweets))


In [79]:
# Randomly select three positive tweets
random_positive_tweets = random.sample(all_positive_tweets, 3)

# Randomly select three negative tweets
random_negative_tweets = random.sample(all_negative_tweets, 3)

print("Randomly selected positive tweets:")
for tweet in random_positive_tweets:
    print(tweet)

print("\nRandomly selected negative tweets:")
for tweet in random_negative_tweets:
    print(tweet)

Randomly selected positive tweets:
@MSLJim You're welcome Jim! Made me chuckle on the train this morning :-) Happy Friday to you too!
@FVCKL oh hell yeah :) I'll be expecting a text next Tuesday
@_stevievie I'll be in Hawaii in December! We'll have a kick back again. Miss you too :)

Randomly selected negative tweets:
@vinrana1986 hii vin plss rply my tweet :((
Darn :( http://t.co/lLeXrMuiXz
@tinfinities oh no she isn't :(((( but yes makati!!


In [80]:

print(len(all_positive_tweets),all_positive_tweets[0])
print(len(all_negative_tweets),all_negative_tweets[0])


5000 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
5000 hopeless for tmr :(


### 2. Preprocessing

In [81]:

def process_tweet(tweet):

  """Process tweet function.
  Input:
  tweet: a string containing a tweet
  Output:
  tweets_clean: a list of words containing the processed tweet
  """
  stemmer = PorterStemmer( )
  stopwords_english = stopwords.words('english')
  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)
  # remove old style retweet text "RT"
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  # remove hyperlinks
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

  # only removing the hash # sign from the word
  tweet = re.sub(r'#', '', tweet)
  # tokenize tweets
  tokenizer = TweetTokenizer(preserve_case=False,
  strip_handles=True, reduce_len=True)

  tweet_tokens = tokenizer.tokenize(tweet)

  tweets_clean = []
  for word in tweet_tokens:
    if (word not in stopwords_english and # remove stopwords
      word not in string.punctuation): # remove punctuation

      # tweets_clean.append(word)
      stem_word = stemmer.stem(word) # stemming word
      tweets_clean.append(stem_word)

  return tweets_clean

In [82]:

# Initializing lists to store processed positive and negative tweets
pro_pos_tw = []
pro_neg_tw = []

# Processing each tweet in the list of positive tweets
for tweet in all_positive_tweets:
    # Applying the process_tweet function to preprocess the tweet
    pro_pos_tw.append(process_tweet(tweet))

# Processing each tweet in the list of negative tweets
for tweet in all_negative_tweets:
    # Applying the process_tweet function to preprocess the tweet
    pro_neg_tw.append(process_tweet(tweet))

# Printing the number of processed positive tweets and an example of the first processed positive tweet
print("Number of processed positive tweets:", len(pro_pos_tw))
print("Example of a processed positive tweet:", pro_pos_tw[0])

# Printing the number of processed negative tweets and an example of the first processed negative tweet
print("Number of processed negative tweets:", len(pro_neg_tw))
print("Example of a processed negative tweet:", pro_neg_tw[0])


Number of processed positive tweets: 5000
Example of a processed positive tweet: ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']
Number of processed negative tweets: 5000
Example of a processed negative tweet: ['hopeless', 'tmr', ':(']


#### 2. a) Train Test Split

In [83]:

# Shuffle positive and negative tweets
random.shuffle(pro_pos_tw)
random.shuffle(pro_neg_tw)

# Select 4000 random positive and negative tweets for training
train_pos_tw = pro_pos_tw[:4000]
train_neg_tw = pro_neg_tw[:4000]

# Select 1000 random positive and negative tweets for testing
test_pos_tw = pro_pos_tw[4000:]
test_neg_tw = pro_neg_tw[4000:]

# Combine training and testing tweets
train_tweets = train_pos_tw + train_neg_tw
test_tweets = test_pos_tw + test_neg_tw

# Create labels
train_labels = [1] * len(train_pos_tw) + [0] * len(train_neg_tw)
test_labels = [1] * len(test_pos_tw) + [0] * len(test_neg_tw)

# Checking sizes of training and testing sets
print("Training set size:", len(train_tweets))
print("Testing set size:", len(test_tweets))

# Checking distribution of labels in training and testing sets
from collections import Counter
print("Training set label distribution:", Counter(train_labels))
print("Testing set label distribution:", Counter(test_labels))


Training set size: 8000
Testing set size: 2000
Training set label distribution: Counter({1: 4000, 0: 4000})
Testing set label distribution: Counter({1: 1000, 0: 1000})


In [84]:
len(train_tweets),len(train_labels)

(8000, 8000)

In [85]:

print(train_tweets)
print(train_labels)


[['ouch', 'slip', 'disc', 'sore', 'today', 'bring', 'thw', 'swim', 'chute', ':)'], ['matt', 'would', 'say', 'welcom', 'adulthood', '...', ':)'], ['ugh', "i'v", 'never', 'rt', 'fade', 'moan', 'leed', 'tweet', ':)'], ['ok', 'good', ':)'], ['yeah', 'boii', 'look', 'arriv', 'post', 'morn', 'latest', 'smashingbook', '5', 'new', 'bibl', ':d'], ['thank', ':)'], ['ur', 'dream'], ['alway', 'forget', 'tag', 'ppl', ':)', ':)', ':)'], ['visit', 'blog'], ['enemi', 'simpli', 'go', 'around', ':-)'], ['u', 'rather', 'find', 'best', 'person', 'find', 'worst', 'make', 'ur', 'life', 'easi', 'n', 'other', 'waytoliveahappylif', ':)'], ['nice', 'raini', 'walk', 'work', 'stop', 'look', 'rain', 'river', 'saw', 'kingfish', ':)'], ['haha', 'ye', '24', 'hr', 'time', 'come', 'touch', 'kepler', '452b', 'chalna', 'hai', ':d'], ['also', 'catch', 'heck', 'lot', 'gun', 'hater', 'tri', 'associ', 'gun', 'um', 'endow', '...', 'rude', 'respons', ':)'], ['haha', 'woke', 'seen', 'typo', 'shoulda', 'snark', 'lession', 'stupi

In [86]:

for l,t in zip(train_labels,train_tweets):
  print(l,t)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1 ['yeah', '2', 'key', 'guess', 'gave', 'mine', ':d']
1 ['current', 'mood', 'af', ':-)']
1 ['fab', 'giveaway', ':)']
1 ['that', 'way', 'see', 'west', 'ham', 'shit', 'small', 'club', 'london', 'villa', '3rd', 'biggest', 'brum', ':)']
1 ['yaaay', 'get', 'see', 'bff', 'tomorrow', ':)']
1 ['rape', 'time', '’', 'best', ':d']
1 ['tune', 'back', 'hubbi', ':)', 'u', 'play', 'queen', 'pl']
1 ['like', 'hurt', 'feel', 'anyth', 'right', ':)']
1 ['omg', 'suck', 'least', 'see', 'one', 'direct', ':)']
1 ['happi', 'long', 'time', ':)']
1 ['follow']
1 ['u', 'r', 'dii', 'n', 'cant', 'forget', 'si', ':)']
1 ['goodnight', ':d']
1 ['last', 'night', 'flip', 'great', 'fun', 'learnt', 'backflip', ':d', 'jumpgiant', 'backflip', 'foampit', '…']
1 ['amaz', 'answer', 'door', 'mail', 'man', 'tell', 'look', 'rough', 'yep', 'thank', ':)']
1 ['got', ':)', 'gossip', 'girl', '90210', 'vampir', 'diari', 'oitnb', "can't", 'think', 'moment', 'hahaha']
1 ['lo

### 3. Sigmoid Function

In [87]:

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost_function(X, y, theta):
    m = len(y)
    h = sigmoid(np.dot(X, theta))
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    return cost


#### 3 a) z is a Scalar as well as Array

In [88]:

# Test with scalar input
scalar_input = 0.5
print("Sigmoid of scalar input:", sigmoid(scalar_input))

# Test with array input
array_input = np.array([-1, 0, 1])
print("Sigmoid of array input:", sigmoid(array_input))


Sigmoid of scalar input: 0.6224593312018546
Sigmoid of array input: [0.26894142 0.5        0.73105858]


### 4. Gradient Descent

In [89]:

def gradient(X, y, theta):
    m = len(y)
    h = sigmoid(np.dot(X, theta))
    grad = (1 / m) * np.dot(X.T, (h - y))
    return grad

def gradient_descent(X, y, theta, alpha=0.01, iterations=1000):
    m = len(y)
    cost_history = []

    for _ in range(iterations):
        theta -= alpha * gradient(X, y, theta)
        cost = cost_function(X, y, theta)
        cost_history.append(cost)

    return theta, cost_history


### 5. Extract Feature

In [90]:

def build_freqs(tweets, ys):
  """Build frequencies.
  Input:
  tweets: a list of tweets
  ys: an m x 1 array with the sentiment label of each tweet
  (either 0 or 1)
  Output:
  freqs: a dictionary mapping each (word, sentiment) pair to its
  frequency
  """
  # Convert np array to list since zip needs an iterable.
  # The squeeze is necessary, or the list ends up with one element.
  # Also note that this is just a NOP if ys is already a list.
  #yslist = np.squeeze(ys).tolist()

  # Start with an empty dictionary and populate it by looping over all tweets and over all processed words in each tweet.
  freqs = { }

  for y, tweet in zip(ys, tweets):
    for word in tweet:
      pair = (word, y)
      if pair in freqs:
        freqs[pair] += 1
      else:
        freqs[pair] = 1

  return freqs


In [91]:
freq = build_freqs(train_tweets+test_tweets,train_labels+test_labels)

In [92]:
print(len(freq),type(freq),freq)

13065 <class 'dict'> {('ouch', 1): 2, ('slip', 1): 1, ('disc', 1): 1, ('sore', 1): 6, ('today', 1): 108, ('bring', 1): 17, ('thw', 1): 1, ('swim', 1): 3, ('chute', 1): 1, (':)', 1): 3568, ('matt', 1): 6, ('would', 1): 84, ('say', 1): 61, ('welcom', 1): 73, ('adulthood', 1): 1, ('...', 1): 289, ('ugh', 1): 1, ("i'v", 1): 35, ('never', 1): 36, ('rt', 1): 12, ('fade', 1): 2, ('moan', 1): 1, ('leed', 1): 1, ('tweet', 1): 61, ('ok', 1): 38, ('good', 1): 238, ('yeah', 1): 47, ('boii', 1): 1, ('look', 1): 137, ('arriv', 1): 67, ('post', 1): 21, ('morn', 1): 101, ('latest', 1): 5, ('smashingbook', 1): 1, ('5', 1): 17, ('new', 1): 143, ('bibl', 1): 2, (':d', 1): 629, ('thank', 1): 620, ('ur', 1): 38, ('dream', 1): 20, ('alway', 1): 67, ('forget', 1): 17, ('tag', 1): 5, ('ppl', 1): 2, ('visit', 1): 30, ('blog', 1): 31, ('enemi', 1): 2, ('simpli', 1): 3, ('go', 1): 148, ('around', 1): 17, (':-)', 1): 692, ('u', 1): 175, ('rather', 1): 5, ('find', 1): 23, ('best', 1): 65, ('person', 1): 19, ('wors

In [93]:
freq

{('ouch', 1): 2,
 ('slip', 1): 1,
 ('disc', 1): 1,
 ('sore', 1): 6,
 ('today', 1): 108,
 ('bring', 1): 17,
 ('thw', 1): 1,
 ('swim', 1): 3,
 ('chute', 1): 1,
 (':)', 1): 3568,
 ('matt', 1): 6,
 ('would', 1): 84,
 ('say', 1): 61,
 ('welcom', 1): 73,
 ('adulthood', 1): 1,
 ('...', 1): 289,
 ('ugh', 1): 1,
 ("i'v", 1): 35,
 ('never', 1): 36,
 ('rt', 1): 12,
 ('fade', 1): 2,
 ('moan', 1): 1,
 ('leed', 1): 1,
 ('tweet', 1): 61,
 ('ok', 1): 38,
 ('good', 1): 238,
 ('yeah', 1): 47,
 ('boii', 1): 1,
 ('look', 1): 137,
 ('arriv', 1): 67,
 ('post', 1): 21,
 ('morn', 1): 101,
 ('latest', 1): 5,
 ('smashingbook', 1): 1,
 ('5', 1): 17,
 ('new', 1): 143,
 ('bibl', 1): 2,
 (':d', 1): 629,
 ('thank', 1): 620,
 ('ur', 1): 38,
 ('dream', 1): 20,
 ('alway', 1): 67,
 ('forget', 1): 17,
 ('tag', 1): 5,
 ('ppl', 1): 2,
 ('visit', 1): 30,
 ('blog', 1): 31,
 ('enemi', 1): 2,
 ('simpli', 1): 3,
 ('go', 1): 148,
 ('around', 1): 17,
 (':-)', 1): 692,
 ('u', 1): 175,
 ('rather', 1): 5,
 ('find', 1): 23,
 ('best',

#### 5. a) b) Take single Tweet and Ouput list of Words

In [94]:

def extract_features(tweet_words, frequency_table):

    """
    Count sentiment based on tweet words and a frequency table.

    Parameters:
        tweet_words (list): List of words in the tweet.
        frequency_table (dict): Dictionary containing word-sentiment score pairs and their frequencies.
        label (int): Label for the sentiment (0 for negative, 1 for positive).

    Returns:
        tuple: A tuple containing positive count, negative count, and label.
    """

    # Initialize counts for positive and negative words
    positive_count = 0
    negative_count = 0

    # Iterate over words in the tweet
    for word in tweet_words:
        # Check if the word is in the frequency table
        for key, value in frequency_table.items():
            if word == key[0]:
                # Increment positive or negative count based on the sentiment score
                if key[1] == 1:
                    positive_count += value
                if key[1] == 0:
                    negative_count += value

    return 1,positive_count,negative_count


#### 5. c) Loop Through all Tweets and Save Files

In [95]:

def process_and_save_features(tweets, labels, frequency_table, output_filename):
    """
    Calculate sentiment counts for each tweet and save the results to a CSV file.

    Parameters:
    - tweets (list): A list of tweets.
    - labels (list): A list of corresponding labels for the tweets.
    - frequency_table (dict): A dictionary containing word-sentiment score pairs and their frequencies.
    - output_filename (str): The name of the output CSV file.

    Returns:
    None
    """
    sentiment_counts = []

    # Iterate over each tweet and label
    for tweet, label in zip(tweets, labels):

      # Calculate sentiment counts for the tweet
      bias, positive_count, negative_count = extract_features(tweet, frequency_table)
      # Append the results to the list
      sentiment_counts.append([bias, positive_count, negative_count, label])

    # Write sentiment counts to CSV file
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        # Write header
        csv_writer.writerow(['Bias', 'Positive_Count', 'Negative_Count', 'Label'])
        # Write data
        csv_writer.writerows(sentiment_counts)


In [96]:
len(train_tweets),len(train_labels)

(8000, 8000)

In [97]:

process_and_save_features(train_tweets,train_labels,freq,"train.csv")
process_and_save_features(test_tweets,test_labels,freq,"test.csv")


### 6. Train

#### 6. a) Stack the Features

In [98]:

# Load train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Shuffle train and test data
#train = shuffle(train)
#test = shuffle(test)

# Display the shape of train and test data
print("Shape of train data:", train.shape)
print("Shape of test data:", test.shape)


Shape of train data: (8000, 4)
Shape of test data: (2000, 4)


In [99]:

# Count of records with label 0 and 1 in training data
train_label_counts = train['Label'].value_counts()
print("Training data label counts:")
print(train_label_counts)

# Count of records with label 0 and 1 in test data
test_label_counts = test['Label'].value_counts()
print("\nTest data label counts:")
print(test_label_counts)


Training data label counts:
1    4000
0    4000
Name: Label, dtype: int64

Test data label counts:
1    1000
0    1000
Name: Label, dtype: int64


In [100]:
train

Unnamed: 0,Bias,Positive_Count,Negative_Count,Label
0,1,3708,133,1
1,1,4082,473,1
2,1,3717,214,1
3,1,3844,136,1
4,1,1171,256,1
...,...,...,...,...
7995,1,70,4708,0
7996,1,153,4937,0
7997,1,539,5012,0
7998,1,238,4974,0


In [101]:

# Prepare data
X_train = train.drop(columns=['Label']).values
y_train = train['Label'].values.reshape(-1, 1)

X_test = test.drop(columns=['Label']).values
y_test = test['Label'].values.reshape(-1, 1)


#### 6. b) Call GradientDescent

In [102]:

# Initialize parameters
theta_initial = np.zeros((X_train.shape[1], 1))

# Train the model
theta, cost_history = gradient_descent(X_train, y_train, theta_initial)

# Evaluate the model
final_train_cost = cost_history[-1]
print("\n Final training cost:", final_train_cost)

# Predict on test data
predicted_probabilities = sigmoid(np.dot(X_test, theta))
predicted_labels = (predicted_probabilities >= 0.5).astype(int)

# Calculate accuracy
accuracy = np.mean(predicted_labels == y_test)
print("\n Accuracy on test set:", accuracy)


  return 1 / (1 + np.exp(-z))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
  cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))



 Final training cost: nan

 Accuracy on test set: 0.994


### 7. Test

#### 7. a) PredictFunction

In [103]:

def predict(X, theta):
    probabilities = sigmoid(np.dot(X, theta))
    return probabilities


#### 7. b) Theta

In [104]:
print(theta)

[[ 0.01454833]
 [ 1.23823953]
 [-1.22771008]]


#### 7. c) Apply Sigmoid

In [105]:
# Predict probabilities for the test data
probabilities = predict(X_test, theta)

  return 1 / (1 + np.exp(-z))


### 8. Evaluate

#### 8. a)b)c)d) Predict, Probabilties, SumUp, Accuracy

In [106]:
# Convert probabilities to binary predictions
predictions = (probabilities >= 0.5).astype(int)

# Evaluate accuracy
accuracy = np.mean(predictions == y_test)
print("Accuracy on test data:", accuracy, "\n")


Accuracy on test data: 0.994 



#### 8. e) Precision, Recall and F-Measure

In [107]:
# Calculate TP, FP, FN, TN
TP = np.sum((predictions == 1) & (y_test == 1))
FP = np.sum((predictions == 1) & (y_test == 0))
FN = np.sum((predictions == 0) & (y_test == 1))
TN = np.sum((predictions == 0) & (y_test == 0))

# Calculate precision, recall, and F-measure
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f_measure = 2 * precision * recall / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F-measure:", f_measure)

Precision: 0.9969818913480886
Recall: 0.991
F-measure: 0.9939819458375125


### 9. Error Analysis

**Extracting Features and Simplifying Classification in Sentiment Analysis**

In sentiment analysis, the extracted features reveal that tweets with a higher positive count are categorized as positive, while others are deemed negative. Therefore, there's no necessity for computing the sigmoid function; a simple relational operator can achieve the same outcome. Additionally, utilizing thesauruses and language mapping can enhance the relevance of words to positive or negative classes, facilitating the removal of neutral words.

1. **Eliminating Sigmoid Computation**: By leveraging the insight that tweets with a greater positive count are already indicative of positive sentiment, there's no requirement for complex sigmoid computations. Simplifying the classification process with basic relational operators streamlines the sentiment analysis task.

2. **Enhancing Word Relevance**: Incorporating thesauruses and language mapping techniques aids in refining the relevance of words to positive or negative sentiment classes. This approach helps in filtering out neutral words, thereby improving the accuracy and efficiency of sentiment analysis algorithms.


#### 9. a) MisClassified

In [108]:

# Filter misclassified tweets
misclassified_tweets = test[predictions.flatten() != y_test.flatten()]

# Display misclassified tweets
print("Misclassified tweets:")
print(misclassified_tweets)


Misclassified tweets:
      Bias  Positive_Count  Negative_Count  Label
29       1             589             726      1
50       1               2               4      1
238      1             264             395      1
482      1             474             530      1
538      1             264             395      1
607      1             766             796      1
615      1             350             366      1
663      1             264             395      1
774      1             264             395      1
1395     1             208             119      0
1654     1            3687             300      0
1967     1             239             231      0


In [109]:

# Get indices of misclassified tweets
misclassified_indices = misclassified_tweets.index.tolist()

for i in range(len(misclassified_indices)):
  print(test_tweets[misclassified_indices[i]])


['omg', "can't", 'tell', 'say', ':p', "can't", 'wait', 'know', '❤', '️']
['chaerin', 'unni']
["i'm", 'play', 'brain', 'dot', 'braindot']
['well', 'get', 'recruit', 'team', 'doom', 'nasti', 'crew', 'go', 'affili', '>:)']
["i'm", 'play', 'brain', 'dot', 'braindot']
['fnaf', '4', 'drop', '...', 'look', 'like', 'sleep', '4']
["i'm", 'repli', 'mention', ':p']
["i'm", 'play', 'brain', 'dot', 'braindot']
["i'm", 'play', 'brain', 'dot', 'braindot']
['corbyn', 'must', 'understand', "labour'", 'new', 'member', 'chang', "party'", 'fortun']
['amb', 'pleas', "harry'", 'jean', ':)', '):', '):', '):']
['u', 'prob', 'fun', 'david']


### 10. On Unit Test

In [110]:

# New tweets to be added
tweets = [
    "i am sad.",
    "feeling :(.",
    "i am happy.",
    ":) moment."
]

process_tweets = []

# Process all tweets
for tweet in tweets:
    process_tweets.append(process_tweet(tweet))

for tw in process_tweets:
    print(tw)

sentiment_counts = []

# Extract features for all processed tweets
for tweet in process_tweets:
    bias, positive_count, negative_count = extract_features(tweet, freq)
    sentiment_counts.append([bias, positive_count, negative_count])

print("\n Featured Extracted : ", sentiment_counts)

# Convert the list of lists to a NumPy array
X_new = np.array(sentiment_counts)

# Pass the array to the predict function
probabilities_new = predict(X_new, theta)

print("\n Prob : ", probabilities_new)

# Convert probabilities to binary predictions
predictions_new = (probabilities_new >= 0.5).astype(int)

# Display the predicted labels
print("\n Predicted labels for new data :", predictions_new.flatten())


['sad']
['feel', ':(']
['happi']
[':)', 'moment']

 Featured Extracted :  [[1, 5, 123], [1, 47, 4729], [1, 211, 25], [1, 3580, 16]]

 Prob :  [[1.29727574e-63]
 [0.00000000e+00]
 [1.00000000e+00]
 [1.00000000e+00]]

 Predicted labels for new data : [0 0 1 1]


  return 1 / (1 + np.exp(-z))


#### Role of Special Symbols in Sentiment Analysis

In sentiment analysis, special symbols and punctuation marks play a vital role in determining the sentiment of a text. Here are some key points to consider:

1. **Emoticons and Emoji:** Emoticons such as ":)", ":(", and emojis like 😊, 😢 directly convey emotions and significantly influence sentiment classification. For example, ":)" typically indicates happiness or positivity, while ":(" indicates sadness or negativity.

2. **Punctuation Marks:** Punctuation marks such as exclamation marks (!), question marks (?), and ellipses (...) provide contextual cues for sentiment analysis. Multiple exclamation marks might indicate excitement, while a question mark might suggest uncertainty.

3. **Capitalization:** The use of uppercase letters can convey emphasis or heightened emotion, impacting sentiment analysis results.

4. **Repeating Characters:** Repeated characters, like "soooo" or "loooove," emphasize the intensity of an emotion, influencing sentiment analysis by amplifying the sentiment conveyed.

5. **Sarcasm and Irony:** Special symbols and punctuation marks are often used to convey sarcasm or irony, challenging sentiment analysis due to the disparity between literal meaning and intended sentiment.

6. **Negation:** Words like "not" or phrases like "not good" can reverse sentiment. Understanding negation context is crucial for accurate sentiment analysis.

7. **Hashtags and Mentions:** In social media sentiment analysis, hashtags (#) and mentions (@) provide context about topics or entities discussed, enhancing sentiment classification accuracy.

In summary, special symbols and punctuation marks carry rich contextual information that significantly impacts sentiment analysis. Incorporating these elements into sentiment analysis models improves their ability to accurately interpret and classify text sentiment.
