In [1]:
#utilities
import re
import numpy as np
import pandas as pd

#nltk
from nltk.stem import WordNetLemmatizer

#SpellCorrection
from spellchecker import SpellChecker

import string
import emoji

In [2]:
import chardet
DATASET_COLUMNS = ['date', 'username', 'text', 'polarity', 'emotion']

#Detect file encoding using chardet
with open('Emcrypt-dataset.csv', 'rb') as f:
    result = chardet.detect(f.read())

# Print the detected encoding
print("Detected encoding:", result['encoding'])

# Read the file using the detected encoding
df = pd.read_csv('Emcrypt-dataset.csv', encoding=result['encoding'], names=DATASET_COLUMNS)
df.sample(5)

Detected encoding: UTF-8-SIG


Unnamed: 0,date,username,text,polarity,emotion
305,20/27/2023,@WillsOutlook,#Crypto #BTC #Bears Two words “35k BTC” 😂,1,happy
575,"6:36 PM · Oct 25, 2023",@Martenel2987,Dread fills me every time I check my crypto in...,0,fear
347,20/30/2023,@Squid_Grow,"New Zealand, Rapper Sesh and DogeCoin Milliona...",1,happy
203,20/28/2025,@ThisIsStevePost,From Sky-High Surges to Staggering Losses — Un...,0,sad
383,20/28/2023,@memesymbol,I'm so angry at the government for taxing cry...,0,angry


In [3]:
#Data preprocessing
data=df[['text','polarity', 'emotion']]

In [4]:
data['polarity'].unique()

array([0, 1])

In [5]:
data_pos = data[data['polarity'] == 1]
data_neg = data[data['polarity'] == 0]

In [6]:
dataset = pd.concat([data_pos, data_neg])

In [7]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
dataset['text'] = dataset['text'].apply(lambda x: cleaning_numbers(x))
dataset['text'].head()

4     How could AI foresee this? #BTC\n\n📊📈Inside th...
5     Bitcoin breaks $K for the first time in ! 🚀\nT...
8     👀 Total #cryptocurrency market cap testing thi...
9     Breaking: $pndc breaks  and now less than x aw...
10    Amazing news! The future of cryptocurrency is ...
Name: text, dtype: object

In [8]:
emoticons_to_keep = [
    '🌈', '🌙', '🌚', '🌞', '🌟', '🌷', '🌸', '🌹', '🌺', '🍀', '🍕', '🍻', '🎀',
    '🎈', '🎉', '🎤', '🎥', '🎧', '🎵', '🎶', '👅', '👇', '👈', '👉', '👋', '👌',
    '👍', '👏', '👑', '💀', '💁', '💃', '💋', '💐', '💓', '💕', '💖', '💗', '💘',
    '💙', '💚', '💛', '💜', '💞', '💤', '💥', '💦', '💪', '💫', '💯', '📷', '🔥',
    '😀', '😁', '😃', '😄', '😅', '😆', '😇', '😈', '😉', '😊', '😋', '😌', '😍',
    '😎', '😏', '😺', '😻', '😽', '🙏', '☀', '☺', '♥', '✅', '✈', '✊', '✋',
    '✌', '✔', '✨', '❄', '❤', '⭐', '😢', '😞', '😟', '😠', '😡', '😔', '😕',
    '😖', '😨', '😩', '😪', '😫', '😰', '😱', '😳', '😶', '😷', '👊', '👎', '❌',
    '😲', '😯', '😮', '😵', '🙊', '🙉', '🙈', '💭', '❗', '⚡', '🎊', '🙁', '💔',
    '😤', '🔪', '🌕', '🚀', '📉', '🤣', '💸'
]

def clean_tweet(text):
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove hashtags and mentions
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove special characters except for emoticons
    text = re.sub(r'[^\w\s.!?{}]+'.format(''.join(emoticons_to_keep)), '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

# Apply the modified cleaning function to the 'text' column in your dataset
dataset['text'] = dataset['text'].apply(clean_tweet)

# Display the 'text' column in the entire dataset
print(dataset['text'])

4      How could AI foresee this? Inside the Brain of...
5      Bitcoin breaks K for the first time in ! 🚀 Thi...
8      Total market cap testing this long term resist...
9      Breaking pndc breaks and now less than x away ...
10     Amazing news! The future of cryptocurrency is ...
                             ...                        
595    Angry and frustrated with the crypto markets e...
596    The constant dread of losing more in crypto is...
597    Cryptos crash has left me in a state of deep s...
598    Every crypto plummet leaves me more furious th...
599    The instability of crypto markets is a source ...
Name: text, Length: 600, dtype: object


In [9]:
from spellchecker import SpellChecker

# Initialize SpellChecker only once to avoid re-creation for each call
spell = SpellChecker()

# List of emoticons to keep
emoticons_to_keep = [
    '🌈', '🌙', '🌚', '🌞', '🌟', '🌷', '🌸', '🌹', '🌺', '🍀', '🍕', '🍻', '🎀',
    '🎈', '🎉', '🎤', '🎥', '🎧', '🎵', '🎶', '👅', '👇', '👈', '👉', '👋', '👌',
    '👍', '👏', '👑', '💀', '💁', '💃', '💋', '💐', '💓', '💕', '💖', '💗', '💘',
    '💙', '💚', '💛', '💜', '💞', '💤', '💥', '💦', '💪', '💫', '💯', '📷', '🔥',
    '😀', '😁', '😃', '😄', '😅', '😆', '😇', '😈', '😉', '😊', '😋', '😌', '😍',
    '😎', '😏', '😺', '😻', '😽', '🙏', '☀', '☺', '♥', '✅', '✈', '✊', '✋',
    '✌', '✔', '✨', '❄', '❤', '⭐', '😢', '😞', '😟', '😠', '😡', '😔', '😕',
    '😖', '😨', '😩', '😪', '😫', '😰', '😱', '😳', '😶', '😷', '👊', '👎', '❌',
    '😲', '😯', '😮', '😵', '🙊', '🙉', '🙈', '💭', '❗', '⚡', '🎊', '🙁', '💔',
    '😤', '🔪', '🌕', '🚀', '📉', '🤣', '💸'
]

# Function for spell correction
def spell_correction(text):
    words = text.split()
    corrected_words = []
    for word in words:
        # Check if the word is an emoticon, if so, skip spell checking
        if word not in emoticons_to_keep:
            if word in spell.unknown([word]):
                corrected_word = spell.correction(word)
                corrected_words.append(corrected_word if corrected_word else word)
            else:
                corrected_words.append(word)
        else:
            corrected_words.append(word)
    return ' '.join(corrected_words)

# Apply spell correction to the entire 'text' column
dataset['text'] = dataset['text'].apply(spell_correction)

# Display the entire dataset
print(dataset)

                                                  text  polarity   emotion
4    How could AI foresee this Inside the Brain of ...         1  surprise
5    Bitcoin breaks K for the first time in ! 🚀 Thi...         1  surprise
8    Total market cap testing this long term resist...         1  surprise
9    Breaking and breaks and now less than i away f...         1  surprise
10   Amazing news The future of cryptocurrency is l...         1  surprise
..                                                 ...       ...       ...
595  Angry and frustrated with the crypto markets e...         0     angry
596  The constant dread of losing more in crypto is...         0      fear
597  Cryptos crash has left me in a state of deep s...         0       sad
598  Every crypto plummet leaves me more furious th...         0     angry
599  The instability of crypto markets is a source ...         0      fear

[600 rows x 3 columns]


In [10]:
def remove_punctuations_and_known_emojis(text):
            if isinstance(text, str):  # Check if text is a valid string
                # Define the regex pattern for known emojis
                emoji_pattern = r'(:\)|:\(|:D|😊|😃|😉|👌|👍|😁|😂|😄|😅|😆|😇|😞|😔|😑|😒|😓|😕|😖|💰|📈|🤣|🎊|😭|🙁|💔|😢|😮|😵|🙀|😱|❗|😠|😡|😤|👎|🔪|🌕|🚀|💎|👀|💭|📉|😨|😩|😰|💸)'
                # Construct the regex pattern to remove punctuation except specified characters and emojis
                punctuation_except_specified = r'[^\w\s]'

                # Replace all other punctuation marks except (. ! ?) and known emojis with an empty string
                text = re.sub(punctuation_except_specified + '|' + emoji_pattern, '', text)
                return text
            
# Apply the defined function to the 'text' column
dataset['text'] = dataset['text'].apply(remove_punctuations_and_known_emojis)
print("Punctuation and known emojis removed from 'text' column.")

# Print the first few rows of the 'text' column after processing
print("Output after removing punctuation and known emojis:")

#Display the entire dataset
print(dataset)


Punctuation and known emojis removed from 'text' column.
Output after removing punctuation and known emojis:
                                                  text  polarity   emotion
4    How could AI foresee this Inside the Brain of ...         1  surprise
5    Bitcoin breaks K for the first time in   This ...         1  surprise
8    Total market cap testing this long term resist...         1  surprise
9    Breaking and breaks and now less than i away f...         1  surprise
10   Amazing news The future of cryptocurrency is l...         1  surprise
..                                                 ...       ...       ...
595  Angry and frustrated with the crypto markets e...         0     angry
596  The constant dread of losing more in crypto is...         0      fear
597  Cryptos crash has left me in a state of deep s...         0       sad
598  Every crypto plummet leaves me more furious th...         0     angry
599  The instability of crypto markets is a source ...         0  

In [11]:
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [12]:
# Stopwords removal applied separately after the option has been chosen and processed
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# Apply the stopwords cleaning after the loop, once the 'text' column has been updated accordingly
dataset['text'] = dataset['text'].apply(cleaning_stopwords)
print("Stopwords removed from 'text' column.")
print(dataset['text'].head())

Stopwords removed from 'text' column.
4     How could AI foresee Inside Brain CallBot Call...
5     Bitcoin breaks K first time This bullish sign ...
8     Total market cap testing long term resistance ...
9     Breaking breaks less away time high Big news pepe
10    Amazing news The future cryptocurrency looking...
Name: text, dtype: object


In [13]:
# Function to clean repeating words
def cleaning_repeating_words(text):
    # This regex pattern targets whole words that are repeated
    return re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

# Assuming 'dataset' is a pandas DataFrame and 'text' is a column in it
# Apply the cleaning function for repeating words to each row in the 'text' column
dataset['text'] = dataset['text'].apply(cleaning_repeating_words)
print("Repeating words cleaned from 'text' column.")
print(dataset['text'].head())

Repeating words cleaned from 'text' column.
4     How could AI foresee Inside Brain CallBot Call...
5     Bitcoin breaks K first time This bullish sign ...
8     Total market cap testing long term resistance ...
9     Breaking breaks less away time high Big news pepe
10    Amazing news The future cryptocurrency looking...
Name: text, dtype: object


In [14]:
dataset['text']=dataset['text'].str.lower()
dataset['text'].head()

4     how could ai foresee inside brain callbot call...
5     bitcoin breaks k first time this bullish sign ...
8     total market cap testing long term resistance ...
9     breaking breaks less away time high big news pepe
10    amazing news the future cryptocurrency looking...
Name: text, dtype: object

In [15]:
import pandas as pd

# Assuming 'dataset' is your DataFrame

# Replace 'output_file.xlsx' with the desired file name
output_file = 'Feature2_file.xlsx'

# Save the dataset to an Excel file
dataset.to_excel(output_file, index=False)

print(f'Dataset saved to {output_file}')

Dataset saved to Feature2_file.xlsx


In [54]:
from nltk.tokenize import RegexpTokenizer

# The pattern matches word characters (\w) and punctuation marks ([^\w\s])
tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')

# Applying the modified tokenizer to the dataset
dataset['text'] = dataset['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
dataset['text'] = dataset['text'].apply(tokenizer.tokenize)
dataset['text'].head()

4     [how, could, ai, foresee, inside, brain, callb...
5     [bitcoin, breaks, k, first, time, this, bullis...
8     [total, market, cap, testing, long, term, resi...
9     [breaking, breaks, less, away, time, high, big...
10    [amazing, news, the, future, cryptocurrency, l...
Name: text, dtype: object

In [55]:
import nltk
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
dataset['text']= dataset['text'].apply(lambda x: stemming_on_text(x))
dataset['text'].head()

4     [how, could, ai, foresee, inside, brain, callb...
5     [bitcoin, breaks, k, first, time, this, bullis...
8     [total, market, cap, testing, long, term, resi...
9     [breaking, breaks, less, away, time, high, big...
10    [amazing, news, the, future, cryptocurrency, l...
Name: text, dtype: object

In [56]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
dataset['text'] = dataset['text'].apply(lambda x: lemmatizer_on_text(x))
dataset['text'].head()

4     [how, could, ai, foresee, inside, brain, callb...
5     [bitcoin, breaks, k, first, time, this, bullis...
8     [total, market, cap, testing, long, term, resi...
9     [breaking, breaks, less, away, time, high, big...
10    [amazing, news, the, future, cryptocurrency, l...
Name: text, dtype: object

In [57]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from scikeras.wrappers import KerasClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib

# Assuming `data` is your dataset with 'text', 'polarity', and 'emotion' columns

texts = data['text']
polarity_labels = data['polarity']
emotion_labels = data['emotion']

# Splitting the dataset into training, testing, and evaluation sets
X_train, X_temp, y_polarity_train, y_polarity_temp = train_test_split(
    texts, polarity_labels, test_size=0.4, random_state=42)
_, _, y_emotion_train, y_emotion_temp = train_test_split(
    texts, emotion_labels, test_size=0.4, random_state=42)

# Further splitting the temporary sets into test and evaluation sets
X_test, X_eval, y_polarity_test, y_polarity_eval = train_test_split(
    X_temp, y_polarity_temp, test_size=0.25, random_state=42)  # 0.25 * 0.4 = 0.1
_, _, y_emotion_test, y_emotion_eval = train_test_split(
    X_temp, y_emotion_temp, test_size=0.25, random_state=42)



# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_eval_seq = tokenizer.texts_to_sequences(X_eval)

# Pad sequences to ensure uniform input size
max_seq_length = max([len(x) for x in X_train_seq])  # Get the length of the longest sequence
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length)
X_eval_pad = pad_sequences(X_eval_seq, maxlen=max_seq_length)

# LSTM model for polarity classification
def create_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_seq_length))
    model.add(LSTM(64, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(32))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification for polarity

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

lstm_model_polarity = KerasClassifier(build_fn=create_lstm_model, epochs=10, batch_size=32, verbose=1)

# SVM model for emotion classification
svm_model_emotion = SVC(probability=True, kernel='linear', verbose=1)

# Train the models
lstm_model_polarity.fit(X_train_pad, y_polarity_train)
svm_model_emotion.fit(X_train_pad, y_emotion_train)

# Save the models
joblib.dump(lstm_model_polarity, "lstm_model_polarity.pkl")
joblib.dump(svm_model_emotion, "svm_model_emotion.pkl")


2023-11-15 03:17:50.501814: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  X, y = self._initialize(X, y)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[LibSVM]*
optimization finished, #iter = 15
obj = -0.000000, rho = -1.978176
nSV = 6, nBSV = 0
Total nSV = 6
*
optimization finished, #iter = 21
obj = -0.000000, rho = -1.852792
nSV = 6, nBSV = 0
Total nSV = 6
*
optimization finished, #iter = 17
obj = -0.000000, rho = -2.027097
nSV = 6, nBSV = 0
Total nSV = 6
*
optimization finished, #iter = 13
obj = -0.000000, rho = -2.026319
nSV = 6, nBSV = 0
Total nSV = 6
*
optimization finished, #iter = 17
obj = -0.000000, rho = 2.029902
nSV = 7, nBSV = 0
*
optimization finished, #iter = 57
obj = -0.000000, rho = -1.501472
nSV = 13, nBSV = 0
Total nSV = 13
*
optimization finished, #iter = 49
obj = -0.000000, rho = -1.536607
nSV = 14, nBSV = 0
Total nSV = 14
*
optimization finished, #iter = 33
obj = -0.000000, rho = -1.544278
nSV = 10, nBSV = 0
Total nSV = 10
*
optimization finished, #iter = 29
obj = -0.000000, rho = -1.553278
nSV = 11, nBS

In [None]:
# Evaluate the models
y_pred_polarity = lstm_model_polarity.predict(X_test_pad)
y_pred_emotion = svm_model_emotion.predict(X_test_pad)

print("Polarity Classification Report:")
print(classification_report(y_polarity_test, y_pred_polarity))

print("Emotion Classification Report:")
print(classification_report(y_emotion_test, y_pred_emotion))

# Evaluation on the evaluation set
y_pred_eval_polarity = lstm_model_polarity.predict(X_eval_pad)
y_pred_eval_emotion = svm_model_emotion.predict(X_eval_pad)

print("Polarity Evaluation Set Classification Report:")
print(classification_report(y_polarity_eval, y_pred_eval_polarity))

print("Emotion Evaluation Set Classification Report:")
print(classification_report(y_emotion_eval, y_pred_eval_emotion))

NameError: name 'lstm_model_polarity' is not defined