### 1.1 Importing Essential Libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
nltk.download('punkt')
# Newer NLTK releases (3.9 and later) make word_tokenize look up 'punkt_tab'
# instead of the pickled 'punkt' models; fetching both keeps a fresh-kernel
# run working regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

### 1.2 Import the Instagram Dataset 

In [2]:
# Load the raw Instagram export; the file is decoded as latin1.
data = pd.read_csv(
    filepath_or_buffer="Datasets/Instagram data.csv",
    encoding="latin1",
)

### 1.3 Cleaning the Instagram Dataset 

1.3.1 Checking if dataset has any missing values (NULL values)

In [3]:
# Report, column by column, how many entries are missing (NULL/NaN).
for column in data.columns:
    missing_count = data[column].isna().sum()
    print("coloumn name:", column, "- missing values", missing_count)
    print("----------------------------------------------------------------------")

coloumn name: Impressions - missing values 0
----------------------------------------------------------------------
coloumn name: From Home - missing values 0
----------------------------------------------------------------------
coloumn name: From Hashtags - missing values 0
----------------------------------------------------------------------
coloumn name: From Explore - missing values 0
----------------------------------------------------------------------
coloumn name: From Other - missing values 0
----------------------------------------------------------------------
coloumn name: Saves - missing values 0
----------------------------------------------------------------------
coloumn name: Comments - missing values 0
----------------------------------------------------------------------
coloumn name: Shares - missing values 0
----------------------------------------------------------------------
coloumn name: Likes - missing values 0
-----------------------------------------------

In [4]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,Impressions,From Home,From Hashtags,From Explore,From Other,Saves,Comments,Shares,Likes,Profile Visits,Follows,Caption,Hashtags
0,3920,2586,1028,619,56,98,9,5,162,35,2,Here are some of the most important data visua...,#finance #money #business #investing #investme...
1,5394,2727,1838,1174,78,194,7,14,224,48,10,Here are some of the best data science project...,#healthcare #health #covid #data #datascience ...
2,4021,2085,1188,0,533,41,11,1,131,62,12,Learn how to train a machine learning model an...,#data #datascience #dataanalysis #dataanalytic...
3,4528,2700,621,932,73,172,10,7,213,23,8,Heres how you can write a Python program to d...,#python #pythonprogramming #pythonprojects #py...
4,2518,1704,255,279,37,96,5,4,123,8,0,Plotting annotations while visualizing your da...,#datavisualization #datascience #data #dataana...


In [5]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Impressions     119 non-null    int64 
 1   From Home       119 non-null    int64 
 2   From Hashtags   119 non-null    int64 
 3   From Explore    119 non-null    int64 
 4   From Other      119 non-null    int64 
 5   Saves           119 non-null    int64 
 6   Comments        119 non-null    int64 
 7   Shares          119 non-null    int64 
 8   Likes           119 non-null    int64 
 9   Profile Visits  119 non-null    int64 
 10  Follows         119 non-null    int64 
 11  Caption         119 non-null    object
 12  Hashtags        119 non-null    object
dtypes: int64(11), object(2)
memory usage: 12.2+ KB


1.3.2 Removing `From Home`, `From Hashtags`, `From Explore`, `From Other`, `Saves`, `Profile Visits` and `Follows` from the dataset, as our main question does not require data from these 7 columns

In [6]:
# Keep only the columns the main question needs. `columns=` names the axis
# explicitly, and errors='ignore' lets this cell be re-run after `data` has
# already been trimmed (a plain drop raises KeyError on the second run because
# the labels are no longer present).
data = data.drop(
    columns=['From Home', 'From Hashtags', 'From Explore', 'From Other',
             'Saves', 'Profile Visits', 'Follows'],
    errors='ignore',
)
data

Unnamed: 0,Impressions,Comments,Shares,Likes,Caption,Hashtags
0,3920,9,5,162,Here are some of the most important data visua...,#finance #money #business #investing #investme...
1,5394,7,14,224,Here are some of the best data science project...,#healthcare #health #covid #data #datascience ...
2,4021,11,1,131,Learn how to train a machine learning model an...,#data #datascience #dataanalysis #dataanalytic...
3,4528,10,7,213,Heres how you can write a Python program to d...,#python #pythonprogramming #pythonprojects #py...
4,2518,5,4,123,Plotting annotations while visualizing your da...,#datavisualization #datascience #data #dataana...
...,...,...,...,...,...,...
114,13700,2,38,373,Here are some of the best data science certifi...,#datascience #datasciencejobs #datasciencetrai...
115,5731,4,1,148,Clustering is a machine learning technique use...,#machinelearning #machinelearningalgorithms #d...
116,4139,0,1,92,Clustering music genres is a task of grouping ...,#machinelearning #machinelearningalgorithms #d...
117,32695,2,75,549,Here are some of the best data science certifi...,#datascience #datasciencejobs #datasciencetrai...


1.3.3 
Performing data cleaning on the unstructured text data using Natural Language Processing.
This is done by removing stopwords and then applying stemming and lemmatization to the `Caption` and `Hashtags` columns.

In [7]:
def preprocess_text(text):
    """Lowercase `text`, strip non-letters, tokenize it and drop English stopwords.

    Returns the surviving tokens as a list of strings, in their original order.
    """
    # Every character that is not a lowercase letter or whitespace is deleted
    # outright (not replaced by a space), so digits, punctuation and '#' vanish.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    words = word_tokenize(letters_only)
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for word in words:
        if word not in english_stopwords:
            kept.append(word)
    return kept

def apply_stemming(tokens):
    """Return a new list holding the PorterStemmer stem of every token in `tokens`."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

def apply_lemmatization(tokens):
    """Return a new list with each token passed through WordNetLemmatizer.lemmatize."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

def _clean_text_column(series):
    """Run tokenize -> stem -> lemmatize on each entry and re-join the tokens with spaces."""
    return (
        series.apply(preprocess_text)
        .apply(apply_stemming)
        .apply(apply_lemmatization)
        .apply(' '.join)
    )


def preprocess_data(input_df):
    """Return a copy of `input_df` with cleaned 'Caption' and 'Hashtags' columns.

    The new 'Caption' is built from the original caption and hashtags joined by
    a space; the new 'Hashtags' is built from the original hashtags alone. Both
    go through preprocess_text, apply_stemming and apply_lemmatization (in that
    order) and are stored as space-separated strings. All other columns are
    left as they are, and `input_df` itself is not modified.

    Parameters
    ----------
    input_df : DataFrame with 'Caption' and 'Hashtags' columns.

    Returns
    -------
    DataFrame with the same columns as `input_df`.

    Raises
    ------
    KeyError if 'Caption' or 'Hashtags' is absent.
    """
    df = input_df.copy()  # work on a copy so the caller's frame stays untouched

    # A missing value would otherwise reach preprocess_text as a float NaN and
    # fail on .lower(); treat missing text as the empty string instead.
    captions = df['Caption'].fillna('')
    hashtags = df['Hashtags'].fillna('')

    # regex=False pins the literal replacement of non-breaking spaces, whatever
    # the installed pandas uses as its default for `regex`.
    full_caption = (
        (captions + ' ' + hashtags)
        .str.replace('\xa0', ' ', regex=False)
        .str.lower()
    )

    # Intermediates stay in local Series rather than scratch columns on `df`,
    # so no column that already exists in the input can be overwritten/dropped.
    df['Caption'] = _clean_text_column(full_caption)
    df['Hashtags'] = _clean_text_column(hashtags)
    return df

# Usage example:
# Assuming you have a DataFrame called `input_df` with columns 'Caption' and 'Hashtags':
# cleaned_df = preprocess_data(input_df)

In [8]:
# Build the cleaned frame; `data` itself is left unchanged because
# preprocess_data works on a copy.
captionDF = preprocess_data(input_df=data)

In [9]:
# Display the cleaned frame (the rendered output elides the middle rows).
captionDF

Unnamed: 0,Impressions,Comments,Shares,Likes,Caption,Hashtags
0,3920,9,5,162,import data visual everi financi data analysts...,financ money busi invest invest trade stockmar...
1,5394,7,14,224,best data scienc project idea healthcar want b...,healthcar health covid data datasci dataanalys...
2,4021,11,1,131,learn train machin learn model give input trai...,data datasci dataanalysi dataanalyt datascient...
3,4528,10,7,213,here write python program detect whether sente...,python pythonprogram pythonproject pythoncod p...
4,2518,5,4,123,plot annot visual data consid good practic mak...,datavisu datasci data dataanalyt machinelearn ...
...,...,...,...,...,...,...
114,13700,2,38,373,best data scienc certif choos datasci datascie...,datasci datasciencejob datasciencetrain datasc...
115,5731,4,1,148,cluster machin learn techniqu use classifi dat...,machinelearn machinelearningalgorithm datasci ...
116,4139,0,1,92,cluster music genr task group music base simil...,machinelearn machinelearningalgorithm datasci ...
117,32695,2,75,549,best data scienc certif choos datasci datascie...,datasci datasciencejob datasciencetrain datasc...


In [10]:
# Persist the cleaned data. index=True is the default and is spelled out here
# because it also writes the row index as an unnamed first column.
# NOTE(review): confirm whether downstream readers expect that index column.
captionDF.to_csv(path_or_buf="Datasets/cleaned-IG-data.csv", index=True)