# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

# Instagram reach regression.
#
# Loads the Instagram reach CSV, turns USERNAME / Caption / Hashtags /
# Followers into a purely numeric feature matrix, and fits two independent
# LinearRegression models on the same train/test split: one predicting
# 'Likes', one predicting 'Time since posted'. MSE and R-squared on the
# held-out 20% are printed for each model.

# Load the dataset
data = pd.read_csv("https://raw.githubusercontent.com/Kuna1Chauhan/EDA/main/instagram_reach.csv")

# Prepare the data
# .copy() gives X its own storage, so the fills below modify X itself
# instead of a temporary derived from `data` (which pandas warns about and,
# with Copy-on-Write enabled, silently ignores).
X = data[['USERNAME', 'Caption', 'Hashtags', 'Followers']].copy()
y_likes = data['Likes']
y_time_since_posted = data['Time since posted']

# A regression target has to be numeric.
# NOTE(review): this column appears to hold strings such as "11 hours" in the
# published CSV -- confirm. When it is not already numeric, keep the first
# number found in each value; the unit is assumed to be the same for every
# row -- TODO confirm. Values without a number become NaN.
if not pd.api.types.is_numeric_dtype(y_time_since_posted):
    y_time_since_posted = pd.to_numeric(
        y_time_since_posted.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
        errors='coerce',
    )

# Replace missing values with empty strings (CountVectorizer cannot handle NaN)
X['Caption'] = X['Caption'].fillna('')
X['Hashtags'] = X['Hashtags'].fillna('')

# NOTE: the encoder and vectorizers below are fitted on the full dataset
# before the train/test split, so the test rows influence the learned
# vocabulary/categories. Kept as in the original flow; move the fitting after
# the split (or into a Pipeline) if a leak-free evaluation is required.

# Encode categorical variables
# Only USERNAME is one-hot encoded and only Followers is passed through.
# Caption and Hashtags are dropped here on purpose: they are raw text and are
# vectorized separately below. Passing them through would put strings into
# the design matrix, which LinearRegression cannot fit.
# sparse_threshold=0 makes the ColumnTransformer always return a dense array,
# which avoids the version-dependent `sparse` / `sparse_output` encoder keyword.
encoder = OneHotEncoder(handle_unknown='ignore')
column_transformer = ColumnTransformer(
    [
        ('encoder', encoder, ['USERNAME']),
        ('followers', 'passthrough', ['Followers']),
    ],
    remainder='drop',
    sparse_threshold=0,
)
X_encoded = column_transformer.fit_transform(X)
feature_names = (
    list(column_transformer.named_transformers_['encoder'].get_feature_names_out(['USERNAME']))
    + ['Followers']
)
X_encoded_df = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)

# Preprocess text columns (Caption and Hashtags)
caption_vectorizer = CountVectorizer()
hashtags_vectorizer = CountVectorizer()
X_caption = caption_vectorizer.fit_transform(X['Caption'])
X_hashtags = hashtags_vectorizer.fit_transform(X['Hashtags'])
# CountVectorizer.get_feature_names_out() returns the bare tokens and ignores
# any argument, so the prefix is added explicitly. Without it, a token that
# occurs in both captions and hashtags would yield duplicate column names.
X_caption_df = pd.DataFrame(
    X_caption.toarray(),
    columns=[f'Caption_{token}' for token in caption_vectorizer.get_feature_names_out()],
    index=X.index,
)
X_hashtags_df = pd.DataFrame(
    X_hashtags.toarray(),
    columns=[f'Hashtags_{token}' for token in hashtags_vectorizer.get_feature_names_out()],
    index=X.index,
)

# Concatenate the encoded features
X = pd.concat([X_encoded_df, X_caption_df, X_hashtags_df], axis=1)

# Split the data into training and testing sets
# Both targets go through the same call so that their rows stay aligned with X.
X_train, X_test, y_likes_train, y_likes_test, y_time_train, y_time_test = train_test_split(
    X, y_likes, y_time_since_posted, test_size=0.2, random_state=42
)

# Create and train the model for predicting the number of likes
likes_model = LinearRegression()
likes_model.fit(X_train, y_likes_train)

# Predict the number of likes on the test set and evaluate the model
likes_predictions = likes_model.predict(X_test)
mse_likes = mean_squared_error(y_likes_test, likes_predictions)
r2_likes = r2_score(y_likes_test, likes_predictions)
print("Mean Squared Error (Likes):", mse_likes)
print("R-squared (Likes):", r2_likes)

# Create and train the model for predicting the time since posted
time_model = LinearRegression()
time_model.fit(X_train, y_time_train)

# Predict the time since posted on the test set and evaluate the model
time_predictions = time_model.predict(X_test)
mse_time = mean_squared_error(y_time_test, time_predictions)
r2_time = r2_score(y_time_test, time_predictions)
print("Mean Squared Error (Time Since Posted):", mse_time)
print("R-squared (Time Since Posted):", r2_time)