In [None]:
import pandas as pd
import numpy as np
# Load the raw influencer workbook and print a first-look overview:
# head, shape, dtypes / non-null counts, missing values per column, summary statistics.
# NOTE(review): the path is absolute and machine-specific; the notebook will only run
# as-is on a machine that has the workbook at this exact location.
file_path = r"C:\Users\hansa\OneDrive\Desktop\internship\top_insta_influencers_data.xlsx"
df = pd.read_excel(file_path)
print(df.head())
print("Shape of dataset:", df.shape)
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nSummary Statistics:")
print(df.describe(include='all'))

In [None]:
# Remove fully duplicated rows, reporting the frame's shape on either side of the drop.
shape_before = df.shape
print("Shape before dropping duplicates:", shape_before)
df = df.drop_duplicates()
shape_after = df.shape
print("Shape after dropping duplicates:", shape_after)

In [None]:
# Impute missing values: numeric columns receive their column median, object (text)
# columns receive their most frequent value.
# NOTE(review): this runs before the string -> number conversion applied later in the
# notebook, so any count column that is still stored as text at this point is
# mode-filled here rather than median-filled — confirm that ordering is intended.
cat_cols = df.select_dtypes(include='object').columns.tolist()
# 'number' covers every numeric dtype (int32, float32, nullable ints, ...) instead of
# only int64/float64, so no numeric column is silently left un-imputed.
num_cols = df.select_dtypes(include='number').columns.tolist()
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
for col in cat_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # mode() is empty when the column holds nothing but nulls; there is no value
        # to impute with in that case, so the column is left untouched instead of
        # raising on modes[0].
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# Re-count nulls per column to confirm what the imputation step left behind.
print("\nMissing values after handling:")
remaining_nulls = df.isnull().sum()
print(remaining_nulls)

In [None]:
def convert_to_number(value):
    """Convert an abbreviated numeric string such as ``"3.3k"`` or ``"1,200"`` to a float.

    Strings are lower-cased, stripped of commas and surrounding whitespace, and a
    trailing ``k`` / ``m`` / ``b`` is interpreted as a multiplier of 1e3 / 1e6 / 1e9.

    Parameters
    ----------
    value : Any
        The cell value to convert.

    Returns
    -------
    float
        The parsed number when ``value`` is a string that can be parsed.
    numpy.nan
        When ``value`` is a string that cannot be parsed, with or without a suffix.
    Any
        ``value`` itself, unchanged, when it is not a string.
    """
    if not isinstance(value, str):
        return value

    text = value.lower().replace(',', '').strip()
    multipliers = {'k': 1e3, 'm': 1e6, 'b': 1e9}
    # text[-1:] is '' for an empty string, which simply misses the lookup.
    factor = multipliers.get(text[-1:])

    # A single try covers both the suffixed and the plain path, so an unparseable
    # string yields NaN in every case. Only ValueError (what float() raises on bad
    # text) is caught, rather than every exception.
    try:
        if factor is not None:
            return float(text[:-1]) * factor
        return float(text)
    except ValueError:
        return np.nan

In [None]:
# Columns to pass through convert_to_number (which handles k/m/b-suffixed strings).
columns_to_convert = ['posts', 'followers', 'avg_likes', 'new_post_avg_like', 'total_likes']
for column_name in columns_to_convert:
    converted = df[column_name].apply(convert_to_number)
    df[column_name] = converted

# Show the resulting dtypes so a column that failed to become numeric is easy to spot.
print("\nData types after conversion:")
converted_dtypes = df[columns_to_convert].dtypes
print(converted_dtypes)

In [None]:
# Eyeball the first rows of the converted columns.
print("\nSample converted values:")
converted_preview = df[columns_to_convert].head()
print(converted_preview)

In [None]:
# Write the cleaned frame to CSV (row index omitted) and report the destination.
# NOTE(review): absolute, machine-specific path.
cleaned_path = r"C:\Users\hansa\OneDrive\Desktop\internship\top_insta_influencers_cleaned.csv"
df.to_csv(path_or_buf=cleaned_path, index=False)
print(f"\n✅ Cleaned dataset saved successfully at:\n{cleaned_path}")

In [None]:
import pandas as pd
# Re-read the cleaned CSV from disk; `df` now refers to this freshly loaded frame.
file_path = r"C:\Users\hansa\OneDrive\Desktop\internship\top_insta_influencers_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Apply seaborn's white-grid look to every figure drawn from here on.
sns.set(style="whitegrid")

In [None]:
# Descriptive statistics for the reloaded, cleaned frame.
print("\nSummary Statistics:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Scatter plot of `followers` against `60_day_eng_rate`, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='followers', y='60_day_eng_rate', ax=ax)
ax.set_title("Followers vs. 60-Day Engagement Rate")
ax.set_xlabel("Followers")
ax.set_ylabel("60-Day Engagement Rate")
fig.tight_layout()
plt.show()

In [None]:
# Histogram (20 bins) of `influence_score` with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['influence_score'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title("Distribution of Influence Score")
ax.set_xlabel("Influence Score")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the ten most frequent values in the `country` column.
top_countries = df['country'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 6))
# NOTE(review): recent seaborn releases warn when `palette` is passed without `hue`;
# confirm against the installed seaborn version before changing the call.
sns.barplot(x=top_countries.values, y=top_countries.index, palette='viridis', ax=ax)
ax.set_title("Top 10 Most Active Countries (Influencers)")
ax.set_xlabel("Number of Influencers")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()

In [None]:
# Derive per-follower ratio features: each new column is its source column divided
# by `followers`. Insertion order of the mapping fixes the order of the new columns.
ratio_sources = {
    'like_follower_ratio': 'total_likes',
    'post_follower_ratio': 'posts',
    'avg_likes_ratio': 'avg_likes',
}
for ratio_name, numerator in ratio_sources.items():
    df[ratio_name] = df[numerator] / df['followers']

print("\nNewly Created Features (first 5 rows):")
new_feature_preview = df[['like_follower_ratio', 'post_follower_ratio', 'avg_likes_ratio']].head()
print(new_feature_preview)

In [None]:
if __name__ == "__main__":
    # Fit a linear regression that predicts `influence_score` from the remaining columns
    # of the cleaned CSV. Names bound here (X_test, y_test, y_pred, the metric functions)
    # are read by the evaluation cell that follows.
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score

    # NOTE(review): `df` is rebuilt from the CSV on disk, so columns added to the
    # in-memory frame after that file was written are not part of X — confirm intended.
    df = pd.read_csv(r"C:\Users\hansa\OneDrive\Desktop\internship\top_insta_influencers_cleaned.csv")
    # NOTE(review): astype(float) raises if this column holds non-numeric text
    # (e.g. values with a trailing '%') — verify against the actual data.
    df['60_day_eng_rate'] = df['60_day_eng_rate'].astype(float)

    # Target, then features: drop the target and two excluded columns, one-hot encode
    # `country` with the first level dropped.
    y = df['influence_score']
    X = pd.get_dummies(
        df.drop(columns=['influence_score', 'rank', 'channel_info']),
        columns=['country'],
        drop_first=True,
    )

    # 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

In [None]:
# Score the fitted model on the held-out split.
# RMSE is taken as the square root of the MSE instead of passing `squared=False`:
# that keyword was deprecated and later removed from scikit-learn's
# mean_squared_error, whereas this form works on every release.
print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)