# Importing the dataset


In [1]:
import pandas as pd
df = pd.read_csv('tweets.csv')

# Basic Understanding of data


In [2]:
# shape of the dataset
df.shape

(100000, 6)

In [3]:
# checking the dtype and null values in the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  100000 non-null  int64 
 1   id      100000 non-null  int64 
 2   date    100000 non-null  object
 3   flag    100000 non-null  object
 4   user    100000 non-null  object
 5   text    100000 non-null  object
dtypes: int64(2), object(4)
memory usage: 4.6+ MB


In [4]:
# checking for null values
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [5]:
# checking for duplicates
df.duplicated().sum()

0

In [6]:
# looking at 5 random datapoints
df.sample(5)

Unnamed: 0,target,id,date,flag,user,text
71618,0,2326296366,Thu Jun 25 06:50:25 PDT 2009,NO_QUERY,DomC234,@jemmahatty You promised us no aneamic breasts
86513,1,1986216885,Sun May 31 18:30:24 PDT 2009,NO_QUERY,five2cute,@yomissb have some ice cream I always tweet ha...
36659,1,1969352953,Fri May 29 23:46:09 PDT 2009,NO_QUERY,BeccaCheung,"cha jung myeon, seafood pancake, and kimchi so..."
66509,1,1677427906,Sat May 02 02:01:29 PDT 2009,NO_QUERY,Mya152,@gfalcone601 Nice &amp; sunny?? I'm so jealous...
40875,1,2009188125,Tue Jun 02 15:17:18 PDT 2009,NO_QUERY,NOCrimeExaminer,Thanks to everyone for their well wishes. My n...


# Basic Pre-Processing of text


In [7]:
# lowercasing
df['text'] = df['text'].str.lower()
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs ahhh i hope your ok!!!
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."


In [8]:
# removing html tags
import re
def remove_html(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)

df['text'] = df['text'].apply(remove_html)
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs ahhh i hope your ok!!!
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."


In [9]:
# removing URLs
def remove_url(text):
  pattern = re.compile(r'https?://\S+|www\.\S+')
  return pattern.sub(r'', text)

df['text'] = df['text'].apply(remove_url)
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs ahhh i hope your ok!!!
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."


In [10]:
# handling chat words
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

# Function
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

df['text'] = df['text'].apply(chat_conversion)
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs ahhh i hope your ok!!!
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps for..."


In [11]:
# decontraction of words
import re

# Contractions dictionary
contractions = {
    "ain't": "am not", "aren't": "are not", "can't": "can not", "can't've": "can not have", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hadn't've": "had not have", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have",
    "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "i'd": "i would",
    "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have", "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
    "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have"
}

def decontract_text(text):
    """
    Decontracts common English contractions in a given text.
    """
    if not isinstance(text, str):
        return text

    # Split the text into words
    words = text.split()
    # Replace each contracted form in the words with the expanded form
    expanded_words = [contractions[word.lower()] if word.lower() in contractions else word for word in words]

    # Join words back into a sentence
    text = ' '.join(expanded_words)

    # Handle remaining common contractions in case insensitive manner
    text = re.sub(r"\b've\b", " have", text, flags=re.IGNORECASE)
    text = re.sub(r"\bn't\b", " not", text, flags=re.IGNORECASE)
    text = re.sub(r"\b're\b", " are", text, flags=re.IGNORECASE)
    text = re.sub(r"\b'll\b", " will", text, flags=re.IGNORECASE)

    return text

df['text'] = df['text'].apply(decontract_text)

In [12]:
# removing punctuation
import string
punc = string.punctuation
def remove_punc(text):
  return text.translate(str.maketrans('', '', punc))

df['text'] = df['text'].apply(remove_punc)
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,chrishasboobs ahhh i hope your ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,misstoriblack cool i have no tweet apps for m...


In [13]:
!pip install symspellpy

from symspellpy import SymSpell, Verbosity

# Initialize SymSpell with default values
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load a dictionary for spell correction
dictionary_path = "en-80k.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Define a function for spell correction using SymSpell
def correct_spelling(text):
    corrected_words = []
    for word in text.split():
        # Get the best suggestion for each word
        suggestion = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_words.append(suggestion[0].term if suggestion else word)  # Append the corrected term or original
    return " ".join(corrected_words)

# Apply the spell correction function to the DataFrame
df['text'] = df['text'].apply(correct_spelling)

df.head(2)

Defaulting to user installation because normal site-packages is not writeable


Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,chrishasboobs hath i hope your ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,misstoriblack cool i have no tweet apps for my...


In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/mridul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# removing StopWords
from nltk.corpus import stopwords
stopword = stopwords.words('english')

# Function
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

df['text'] = df['text'].apply(remove_stopwords)
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,chrishasboobs hath hope ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,misstoriblack cool tweet apps razor


In [16]:
# Handling Emojis
!pip install emoji
import emoji

df['text'] = df['text'].apply(lambda x: emoji.demojize(x))
df.head(2)

Defaulting to user installation because normal site-packages is not writeable


Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,chrishasboobs hath hope ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,misstoriblack cool tweet apps razor


In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/mridul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Import WordNetLemmatizer from NLTK library
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize words
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply the lemmatization function to the 'text' column
df['text'] = df['text'].apply(lemmatize_words)
df.head(2)

Unnamed: 0,target,id,date,flag,user,text
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,chrishasboobs hath hope ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,misstoriblack cool tweet apps razor


# Basic Feature Engineering


## id


In [19]:
df['id'].nunique()

99993

In [20]:
# checking for duplicates because in the previous output there are 99993 unique values which means 7 duplicates

df[df['id'].duplicated()==True]

# output is listing duplicates but on observing closely you will not find any similarity thus not dropping duplicates

Unnamed: 0,target,id,date,flag,user,text
14298,1,1835531233,Mon May 18 06:12:08 PDT 2009,NO_QUERY,alexmotu,shit party great working new track raining ams...
40779,1,1880879223,Fri May 22 02:49:47 PDT 2009,NO_QUERY,GemmaSugapop,business study happy friday yeyyyyyyyyy happy ...
56882,0,1982434182,Sun May 31 11:12:02 PDT 2009,NO_QUERY,Wingkit,ooohmtv movie award cant wait really cant watc...
72084,1,1961298167,Fri May 29 09:17:11 PDT 2009,NO_QUERY,skotakiscool,lanceseabourn know sub voodoo good
94273,1,1971497459,Sat May 30 07:14:58 PDT 2009,NO_QUERY,xXHAZELXx,dreamingofd yeah lovely change laughing loud n...
98580,0,2055800600,Sat Jun 06 09:58:50 PDT 2009,NO_QUERY,benny92,revision born good day yesterday tho hahn took...
99791,1,2062422497,Sat Jun 06 23:11:16 PDT 2009,NO_QUERY,nagham,parulbh wish come event may dna live chat join u


In [21]:
# dropping the id column
df.drop('id', axis=1, inplace=True)

## Date


In [22]:
# dates: change the dtype to datetime64[ns]
df['date'] = df['date'].astype('datetime64[ns]')



In [23]:
# starting with the time features
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['second'] = df['date'].dt.second

In [24]:
# applying sine and cosine transformation to hour

import numpy as np
# Convert 'hour' to radians (normalize to the range [0, 2π])
df['hour_rad'] = df['hour'] * (2 * np.pi / 24)

# Apply sine and cosine transformations
df['hour_sin'] = np.sin(df['hour_rad'])
df['hour_cos'] = np.cos(df['hour_rad'])


df['minute_radian'] = df['minute'] * (2 * np.pi / 24)
# Apply sine and cosine transformations
df['min_sin'] = np.sin(df['minute_radian'])
df['min_cos'] = np.cos(df['minute_radian'])


# Convert 'second' to radians (normalize to the range [0, 2π])
df['second_radian'] = df['second'] * (2 * np.pi / 24)
# Apply sine and cosine transformations
df['sec_sin'] = np.sin(df['second_radian'])
df['sec_cos'] = np.cos(df['second_radian'])

In [25]:
# comparing the correlation between these time features and target column
df[['hour', 'minute', 'second', 'hour_sin', 'hour_cos', 'min_sin', 'min_cos', 'sec_sin', 'sec_cos', 'target']].corr()

Unnamed: 0,hour,minute,second,hour_sin,hour_cos,min_sin,min_cos,sec_sin,sec_cos,target
hour,1.0,0.003214,-0.013137,-0.758319,-0.086177,0.007173,0.001599,-0.004295,0.001685,-0.056037
minute,0.003214,1.0,-0.001407,1.5e-05,0.004858,0.006692,-0.044906,0.004266,-0.00071,-0.00032
second,-0.013137,-0.001407,1.0,0.01419,-0.007896,0.002406,0.002926,0.002758,-0.036119,-0.001252
hour_sin,-0.758319,1.5e-05,0.01419,1.0,-0.03947,-0.007727,-0.000591,-0.002253,-0.005056,0.045912
hour_cos,-0.086177,0.004858,-0.007896,-0.03947,1.0,-0.003888,-0.00619,0.009172,0.00162,0.073294
min_sin,0.007173,0.006692,0.002406,-0.007727,-0.003888,1.0,-0.006607,-0.007223,0.001062,-0.004311
min_cos,0.001599,-0.044906,0.002926,-0.000591,-0.00619,-0.006607,1.0,-0.003393,0.004088,0.000492
sec_sin,-0.004295,0.004266,0.002758,-0.002253,0.009172,-0.007223,-0.003393,1.0,-0.003441,-0.004449
sec_cos,0.001685,-0.00071,-0.036119,-0.005056,0.00162,0.001062,0.004088,-0.003441,1.0,0.002284
target,-0.056037,-0.00032,-0.001252,0.045912,0.073294,-0.004311,0.000492,-0.004449,0.002284,1.0


In [26]:
# as we can observe in the previous output that there is not much effect of min and sec features. hour_cos has the maximum correlation thus using that instead of hour
drop_col =['hour', 'hour_rad', 'hour_sin', 'minute', 'minute_radian', 'min_sin', 'min_cos', 'second', 'second_radian', 'sec_sin', 'sec_cos']
df.drop(columns = drop_col, axis=1, inplace=True)

In [27]:
df.columns

Index(['target', 'date', 'flag', 'user', 'text', 'hour_cos'], dtype='object')

In [28]:
# working with date features
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofweek'] = df['date'].dt.dayofweek

# Cyclical transformations
df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

df['day_sin'] = np.sin(2 * np.pi * df['day'] / 29)
df['day_cos'] = np.cos(2 * np.pi * df['day'] / 29)

df['is_weekend'] = df['dayofweek'] >= 5  # 5 and 6 represent Saturday and Sunday

In [29]:
# finding correlation between these new features and target column
df[['target', 'month', 'month_sin', 'month_cos', 'day', 'day_sin', 'day_cos', 'dayofweek', 'dayofweek_sin', 'dayofweek_cos', 'is_weekend']].corr()

Unnamed: 0,target,month,month_sin,month_cos,day,day_sin,day_cos,dayofweek,dayofweek_sin,dayofweek_cos,is_weekend
target,1.0,-0.166206,0.172408,0.129081,-0.091395,0.260514,0.166221,0.036476,-0.127139,0.118703,0.091205
month,-0.166206,1.0,-0.996201,-0.939002,-0.42533,0.150829,-0.095176,-0.26455,0.274583,0.020967,-0.245992
month_sin,0.172408,-0.996201,1.0,0.905486,0.44926,-0.13784,0.122512,0.285376,-0.293979,-0.03589,0.258677
month_cos,0.129081,-0.939002,0.905486,1.0,0.298502,-0.190661,-0.020009,0.1622,-0.177118,0.039561,0.177199
day,-0.091395,-0.42533,0.44926,0.298502,1.0,-0.496993,-0.027861,0.261155,-0.176768,-0.244092,0.134501
day_sin,0.260514,0.150829,-0.13784,-0.190661,-0.496993,1.0,0.446891,0.245711,-0.254096,0.093804,0.319634
day_cos,0.166221,-0.095176,0.122512,-0.020009,-0.027861,0.446891,1.0,0.161095,-0.172204,-0.128877,0.118927
dayofweek,0.036476,-0.26455,0.285376,0.1622,0.261155,0.245711,0.161095,1.0,-0.763061,-0.371926,0.828072
dayofweek_sin,-0.127139,0.274583,-0.293979,-0.177118,-0.176768,-0.254096,-0.172204,-0.763061,1.0,0.116036,-0.810856
dayofweek_cos,0.118703,0.020967,-0.03589,0.039561,-0.244092,0.093804,-0.128877,-0.371926,0.116036,1.0,0.068647


In [30]:
# dropping column which weren't that useful
drop_col = ['month', 'day', 'dayofweek', 'dayofweek_cos', 'month_cos', 'day_cos', 'date']
df.drop(columns = drop_col, axis=1, inplace=True)

In [31]:
df.columns

Index(['target', 'flag', 'user', 'text', 'hour_cos', 'dayofweek_sin',
       'month_sin', 'day_sin', 'is_weekend'],
      dtype='object')

## Flag


In [32]:
# dropping the flag column as it has only value = "No_QUERY"
df.drop('flag', axis=1, inplace=True)

## User


In [33]:
# Encoding the users column based on frequency distribution
user_counts = df['user'].value_counts()
df['user_encoded'] = df['user'].map(user_counts)

In [34]:
# checking correlation of user_freq
df['user_encoded'].corr(df['target'])

0.0515864288204148

In [35]:
df.drop('user', axis=1, inplace=True)

## Text


In [36]:
# checking for null values in text column
df['text'].isnull().sum()

0

In [37]:
# dropping these null datapoint
df.dropna(subset=['text'], inplace=True)

In [38]:
!python3 -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m[31m5.2 MB/s[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [39]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [40]:
def count_characters(text):
    return len(text)

def count_words(text):
    doc = nlp(text)  # Process the text with spaCy
    return len([token for token in doc if not token.is_space])  # Count non-space tokens

def count_sentences(text):
    doc = nlp(text)  # Process the text with spaCy
    return len(list(doc.sents))  # Count sentences

In [41]:
# Process the entire text column at once
docs = list(nlp.pipe(df['text'].fillna('')))  # Fill NaN with an empty string

# Create new columns for character, word, and sentence counts
df['char_count'] = df['text'].str.len().fillna(0)  # Count characters directly
df['word_count'] = [len([token for token in doc if not token.is_space]) for doc in docs]  # Count words
df['sent_count'] = [len(list(doc.sents)) for doc in docs]  # Count sentences

In [42]:
df[['char_count', 'word_count', 'sent_count', 'target']].corr()

Unnamed: 0,char_count,word_count,sent_count,target
char_count,1.0,0.953295,0.107699,0.022357
word_count,0.953295,1.0,0.134106,-0.015103
sent_count,0.107699,0.134106,1.0,0.015587
target,0.022357,-0.015103,0.015587,1.0


In [43]:
# dropping char_count as it has less correlation with target column but very high correlation with word_count
df.drop('char_count', axis=1, inplace=True)

In [44]:
# Embedding the text
from sklearn.feature_extraction.text import CountVectorizer
# Initialize CountVectorizer
vectorizer = CountVectorizer(max_features=5000)

# Fit and transform the 'text' column
X = vectorizer.fit_transform(df['text'])

# Convert the result to a DataFrame for easier inspection
embeddings_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df = pd.concat([df, embeddings_df], axis=1)
df_train = df.drop('text', axis=1)

In [45]:
# Save df_train to a CSV file
df_train.to_csv('df_train.csv', index=False)


# Model Building 

In [46]:
import pandas as pd
df_train = pd.read_csv('df_train.csv', nrows = 10000)

In [47]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

In [48]:
X = df_train.drop('target', axis=1)
y = df_train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Train the Linear Regression model
linear_regression = LinearRegression()
linear_regression_model = linear_regression.fit(X_train, y_train)

# Predict on the test set
y_pred_linear = linear_regression_model.predict(X_test)

# Convert continuous predictions to binary values (e.g., 0 or 1)
y_pred_binary = (y_pred_linear >= 0.5).astype(int)

# Calculate accuracy score on the test set
print("Accuracy Score:", accuracy_score(y_test, y_pred_binary))


In [None]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model
logistic_regression = LogisticRegression()
logistic_regression_model = logistic_regression.fit(X_train, y_train)

# Predict on the test set
y_pred_logistic = logistic_regression_model.predict(X_test)

# Calculate accuracy score on the test set
print("Accuracy Score:", accuracy_score(y_test, y_pred_logistic))