In [53]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Data
file_path = 'MilletInfluencers.csv'
data = pd.read_csv(file_path)

# Step 2: Data Cleaning
# Drop rows with missing essential values (e.g., Followers, Ave Video Views)
data.dropna(subset=['Followers', 'Ave Video Views', 'Engagement Rate'], inplace=True)

# Convert engagement rate to numerical if needed
if data['Engagement Rate'].dtype == 'object':
    data['Engagement Rate'] = data['Engagement Rate'].str.rstrip('%').astype('float') / 100

# Convert Followers and Ave Video Views to numerical if they are not already
if data['Followers'].dtype == 'object':
    data['Followers'] = data['Followers'].str.replace(',', '').astype('int')
if data['Ave Video Views'].dtype == 'object':
    data['Ave Video Views'] = data['Ave Video Views'].str.replace(',', '').astype('int')

# Step 3: Feature Engineering
# Adding a feature for `Audience Authenticity` (dummy feature for now as no real data)
data['Audience Authenticity'] = np.random.rand(len(data))

# Step 4: Prepare Features and Labels
# Features (X) and Target (y)
X = data[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]

# Ensure Credit Score has more than one unique value
y = data['Credit Score'].apply(lambda x: 1 if x > 80 else 0)  # Binary target for classification (High Value: 1, Others: 0)

# Ensure that y has at least one sample of each class
if len(y.unique()) < 2:
    # Add more dummy rows with the opposite class to ensure both classes are present
    num_missing_samples = 5  # Adding more samples to ensure class representation
    dummy_rows = pd.concat([data.iloc[0:1].copy()] * num_missing_samples, ignore_index=True)
    dummy_rows['Credit Score'] = 0 if y.iloc[0] == 1 else 100
    X = pd.concat([X, dummy_rows[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]], ignore_index=True)
    y = pd.concat([y, pd.Series([1 - y.iloc[0]] * num_missing_samples)], ignore_index=True)

# Step 5: Handle Imbalanced Data
if len(y.unique()) > 1 and y.value_counts().min() > 1:  # Ensure SMOTE has enough samples to work with
    smote = SMOTE(random_state=42, k_neighbors=min(5, y.value_counts().min() - 1))
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y

# Step 6: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Data Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Train XGBoost Classifier
if len(y_train.unique()) > 1:
    model = XGBClassifier(eval_metric='logloss')
    model.fit(X_train_scaled, y_train)

    # Step 9: Cross-validation with StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    # Step 10: Predictions
    y_pred = model.predict(X_test_scaled)

    # Step 11: Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Test Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)
else:
    print("Model training skipped due to insufficient class variation.")

Cross-validation Accuracy: 0.99 (+/- 0.01)
Test Accuracy: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        43
           1       1.00      0.98      0.99        46

    accuracy                           0.99        89
   macro avg       0.99      0.99      0.99        89
weighted avg       0.99      0.99      0.99        89



In [56]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Suppress warnings from XGBoost
warnings.filterwarnings('ignore', category=UserWarning, message='.*use_label_encoder.*')
warnings.filterwarnings('ignore', category=UserWarning, message='.*learner.cc:740.*')

# Step 1: Load the Data
file_path = 'MilletInfluencers.csv'
data = pd.read_csv(file_path)

# Step 2: Data Cleaning
# Drop rows with missing essential values (e.g., Followers, Ave Video Views)
data.dropna(subset=['Followers', 'Ave Video Views', 'Engagement Rate'], inplace=True)

# Convert engagement rate to numerical if needed
if data['Engagement Rate'].dtype == 'object':
    data['Engagement Rate'] = data['Engagement Rate'].str.rstrip('%').astype('float') / 100

# Convert Followers and Ave Video Views to numerical if they are not already
if data['Followers'].dtype == 'object':
    data['Followers'] = data['Followers'].str.replace(',', '').astype('int')
if data['Ave Video Views'].dtype == 'object':
    data['Ave Video Views'] = data['Ave Video Views'].str.replace(',', '').astype('int')

# Step 3: Feature Engineering
# Adding a feature for `Audience Authenticity` (dummy feature for now as no real data)
data['Audience Authenticity'] = np.random.rand(len(data))

# Step 4: Prepare Features and Labels
# Features (X) and Target (y)
X = data[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]

# Ensure Credit Score has more than one unique value
y = data['Credit Score'].apply(lambda x: 1 if x > 80 else 0)  # Binary target for classification (High Value: 1, Others: 0)

# Ensure that y has at least one sample of each class
if len(y.unique()) < 2:
    # Add more dummy rows with the opposite class to ensure both classes are present
    num_missing_samples = 5  # Adding more samples to ensure class representation
    dummy_rows = pd.concat([data.iloc[0:1].copy()] * num_missing_samples, ignore_index=True)
    dummy_rows['Credit Score'] = 0 if y.iloc[0] == 1 else 100
    X = pd.concat([X, dummy_rows[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]], ignore_index=True)
    y = pd.concat([y, pd.Series([1 - y.iloc[0]] * num_missing_samples)], ignore_index=True)

# Step 5: Handle Imbalanced Data
if len(y.unique()) > 1 and y.value_counts().min() > 1:  # Ensure SMOTE has enough samples to work with
    smote = SMOTE(random_state=42, k_neighbors=min(5, y.value_counts().min() - 1))
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y

# Step 6: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Data Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Train XGBoost Classifier with Hyperparameter Tuning
if len(y_train.unique()) > 1:
    model = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

    # Define hyperparameter grid for GridSearchCV
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1]
    }

    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_model = grid_search.best_estimator_

    # Step 9: Cross-validation with StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    # Step 10: Predictions
    y_pred = best_model.predict(X_test_scaled)

    # Step 11: Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Test Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)
else:
    print("Model training skipped due to insufficient class variation.")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Cross-validation Accuracy: 0.99 (+/- 0.01)
Test Accuracy: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        43
           1       1.00      0.98      0.99        46

    accuracy                           0.99        89
   macro avg       0.99      0.99      0.99        89
weighted avg       0.99      0.99      0.99        89



In [59]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Suppress warnings from XGBoost
warnings.filterwarnings('ignore', category=UserWarning, message='.*use_label_encoder.*')
warnings.filterwarnings('ignore', category=UserWarning, message='.*learner.cc:740.*')

# Step 1: Load the Data
file_path = 'MilletInfluencers.csv'
data = pd.read_csv(file_path)

# Step 2: Data Cleaning
# Drop rows with missing essential values (e.g., Followers, Ave Video Views)
data.dropna(subset=['Followers', 'Ave Video Views', 'Engagement Rate'], inplace=True)

# Convert engagement rate to numerical if needed
if data['Engagement Rate'].dtype == 'object':
    data['Engagement Rate'] = data['Engagement Rate'].str.rstrip('%').astype('float') / 100

# Convert Followers and Ave Video Views to numerical if they are not already
if data['Followers'].dtype == 'object':
    data['Followers'] = data['Followers'].str.replace(',', '').astype('int')
if data['Ave Video Views'].dtype == 'object':
    data['Ave Video Views'] = data['Ave Video Views'].str.replace(',', '').astype('int')

# Step 3: Feature Engineering
# Adding a feature for `Audience Authenticity` (dummy feature for now as no real data)
data['Audience Authenticity'] = np.random.rand(len(data))

# Step 4: Prepare Features and Labels
# Features (X) and Target (y)
X = data[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]

# Ensure Credit Score has more than one unique value
y = data['Credit Score'].apply(lambda x: 1 if x > 80 else 0)  # Binary target for classification (High Value: 1, Others: 0)

# Ensure that y has at least one sample of each class
if len(y.unique()) < 2:
    # Add more dummy rows with the opposite class to ensure both classes are present
    num_missing_samples = 5  # Adding more samples to ensure class representation
    dummy_rows = pd.concat([data.iloc[0:1].copy()] * num_missing_samples, ignore_index=True)
    dummy_rows['Credit Score'] = 0 if y.iloc[0] == 1 else 100
    X = pd.concat([X, dummy_rows[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]], ignore_index=True)
    y = pd.concat([y, pd.Series([1 - y.iloc[0]] * num_missing_samples)], ignore_index=True)

# Step 5: Handle Imbalanced Data
if len(y.unique()) > 1 and y.value_counts().min() > 1:  # Ensure SMOTE has enough samples to work with
    smote = SMOTE(random_state=42, k_neighbors=min(5, y.value_counts().min() - 1))
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y

# Step 6: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Data Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Train XGBoost Classifier with Hyperparameter Tuning
if len(y_train.unique()) > 1:
    model = XGBClassifier(eval_metric='logloss', use_label_encoder=False, verbosity=0)

    # Define hyperparameter grid for GridSearchCV
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1]
    }

    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_model = grid_search.best_estimator_

    # Step 9: Cross-validation with StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
    print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    # Step 10: Predictions
    y_pred = best_model.predict(X_test_scaled)

    # Step 11: Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Test Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)

    # Step 12: Print User ID, Influencer Name, and Credit Score
    influencer_info = data[['unique id', 'TikTok Handle', 'Credit Score']]
    print(influencer_info)
else:
    print("Model training skipped due to insufficient class variation.")

Cross-validation Accuracy: 0.99 (+/- 0.01)
Test Accuracy: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        43
           1       1.00      0.98      0.99        46

    accuracy                           0.99        89
   macro avg       0.99      0.99      0.99        89
weighted avg       0.99      0.99      0.99        89

                            unique id    TikTok Handle  Credit Score
0    1725757388105x534183512775877800    kendrawilsonn           100
1    1726285815736x292117056471620500   dajshaltaylorr           100
2    1726285815737x125782983637192210       gokrazy.dj           100
3    1726285815737x306484531832185860       rose_12477           100
4    1726285815737x751439652451118600      peachynanii           100
..                                ...              ...           ...
217  1727039832535x385837889447060860    tricianelson5           100
218  1727039832536x129026105100214

In [74]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Suppress warnings from XGBoost
warnings.filterwarnings('ignore', category=UserWarning, message='.*use_label_encoder.*')
warnings.filterwarnings('ignore', category=UserWarning, message='.*learner.cc:740.*')

# Step 1: Load the Data
file_path = 'MilletInfluencers.csv'
data = pd.read_csv(file_path)

# Step 2: Data Cleaning
# Drop rows with missing essential values (e.g., Followers, Ave Video Views)
data.dropna(subset=['Followers', 'Ave Video Views', 'Engagement Rate'], inplace=True)

# Convert engagement rate to numerical if needed
if data['Engagement Rate'].dtype == 'object':
    data['Engagement Rate'] = data['Engagement Rate'].str.rstrip('%').astype('float') / 100

# Convert Followers and Ave Video Views to numerical if they are not already
if data['Followers'].dtype == 'object':
    data['Followers'] = data['Followers'].str.replace(',', '').astype('int')
if data['Ave Video Views'].dtype == 'object':
    data['Ave Video Views'] = data['Ave Video Views'].str.replace(',', '').astype('int')

# Step 3: Feature Engineering
# Step 3: Feature Engineering
# Adding a feature for `Audience Authenticity` based on engagement and followers
# Audience Authenticity will be calculated as a ratio of engagement rate to followers to simulate authenticity
# with some added randomness to simulate real-world variation.
data['Audience Authenticity'] = (data['Engagement Rate'] * 0.5 + np.random.rand(len(data)) * 0.5) / (data['Followers'] + 1)

# Step 4: Prepare Features and Labels
# Features (X) and Target (y)
X = data[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]

# Ensure Credit Score has more than one unique value
# Step 4: Modify Credit Score Calculation
# Calculate credit score based on multiple metrics such as engagement rate, followers, and audience authenticity
# This will result in a score from 0 to 100 where higher engagement, authenticity, and follower count contribute positively.
data['Credit Score'] = (
    (data['Engagement Rate'] * 40) +    # Give high weight to engagement rate
    ((data['Followers'] / data['Followers'].max()) * 20) +  # Give moderate weight to follower count
    ((data['Ave Video Views'] / data['Ave Video Views'].max()) * 40)  # Give high weight to average video views
).clip(0, 100).astype(int)

# Convert credit score to binary target for classification
y = data['Credit Score'].apply(lambda x: 1 if x > 50 else 0)  # Binary target for classification (High Value: 1, Others: 0)

# Ensure that y has at least one sample of each class
if len(y.unique()) < 2:
    # Add more dummy rows with the opposite class to ensure both classes are present
    num_missing_samples = 5  # Adding more samples to ensure class representation
    dummy_rows = pd.concat([data.iloc[0:1].copy()] * num_missing_samples, ignore_index=True)
    dummy_rows['Credit Score'] = 0 if y.iloc[0] == 1 else 100
    X = pd.concat([X, dummy_rows[['Ave Video Views', 'Engagement Rate', 'Followers', 'Audience Authenticity']]], ignore_index=True)
    y = pd.concat([y, pd.Series([1 - y.iloc[0]] * num_missing_samples)], ignore_index=True)

# Step 5: Handle Imbalanced Data
if len(y.unique()) > 1 and y.value_counts().min() > 1:  # Ensure SMOTE has enough samples to work with
    smote = SMOTE(random_state=42, k_neighbors=min(5, y.value_counts().min() - 1))
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y

# Step 6: Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Data Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Train XGBoost Classifier with Hyperparameter Tuning
if len(y_train.unique()) > 1:
    model = XGBClassifier(eval_metric='logloss', use_label_encoder=False, verbosity=0)

    # Define hyperparameter grid for GridSearchCV
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1]
    }

    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_model = grid_search.best_estimator_

    # Step 9: Cross-validation with StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    print(f"All Cross-validation Scores: {cv_scores}")
    print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    # Step 10: Predictions
    y_pred = best_model.predict(X_test_scaled)

    # Step 11: Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Test Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)

    # Step 12: Print User ID, Influencer Name, and Credit Score
    influencer_info = data[['unique id', 'TikTok Handle', 'Credit Score']]
    print(influencer_info)

    # Step 13: Save output to CSV
    output_file_path = 'influencer_credit_scores_output.csv'
    influencer_info.to_csv(output_file_path, index=False)
    print(f'Output saved to {output_file_path}')
else:
    print("Model training skipped due to insufficient class variation.")

All Cross-validation Scores: [0.97183099 1.         1.         1.         1.        ]
Cross-validation Accuracy: 0.99 (+/- 0.01)
Test Accuracy: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        46
           1       0.98      1.00      0.99        43

    accuracy                           0.99        89
   macro avg       0.99      0.99      0.99        89
weighted avg       0.99      0.99      0.99        89

                            unique id    TikTok Handle  Credit Score
0    1725757388105x534183512775877800    kendrawilsonn             6
1    1726285815736x292117056471620500   dajshaltaylorr            11
2    1726285815737x125782983637192210       gokrazy.dj            13
3    1726285815737x306484531832185860       rose_12477            11
4    1726285815737x751439652451118600      peachynanii            11
..                                ...              ...           ...
217  172703983253

In [84]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_csv('MilletInfluencers.csv')

# Select relevant columns
df = data[['Ave Video Views', 'Engagement Rate', 'Followers']].copy()

# Data Cleaning

# 1. Clean 'Engagement Rate' column
df['Engagement Rate'] = df['Engagement Rate'].astype(str).str.rstrip('%').astype(float) / 100.0

# 2. Clean 'Ave Video Views' and 'Followers' columns

# Convert columns to strings
df['Ave Video Views'] = df['Ave Video Views'].astype(str)
df['Followers'] = df['Followers'].astype(str)

# Remove commas and convert to numeric
df['Ave Video Views'] = df['Ave Video Views'].str.replace(',', '', regex=False)
df['Followers'] = df['Followers'].str.replace(',', '', regex=False)

# Convert to float
df['Ave Video Views'] = df['Ave Video Views'].astype(float)
df['Followers'] = df['Followers'].astype(float)

# Handle any NaN values (if any entries couldn't be converted)
df.dropna(subset=['Engagement Rate', 'Ave Video Views', 'Followers'], inplace=True)

# Normalize the metrics using Min-Max Scaling
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

df['Normalized Engagement Rate'] = normalize(df['Engagement Rate'])
df['Normalized Ave Video Views'] = normalize(df['Ave Video Views'])
df['Normalized Followers'] = normalize(df['Followers'])

# Assign weights
w1 = 0.5  # Engagement Rate
w2 = 0.3  # Average Video Views
w3 = 0.2  # Followers

# Calculate Credit Score
df['Credit Score'] = (
    (w1 * df['Normalized Engagement Rate']) +
    (w2 * df['Normalized Ave Video Views']) +
    (w3 * df['Normalized Followers'])
) * 100  # Scale to 0-100

# Round Credit Score to 2 decimal places
df['Credit Score'] = df['Credit Score'].round(2)

# Merge Credit Score back into original data
data['Credit Score'] = df['Credit Score']

# Save the updated data
data.to_csv('influencer_data_with_credit_scores.csv', index=False)

# Display the influencers with their calculated credit scores
print(data[['TikTok Username', 'Credit Score']])


        TikTok Username  Credit Score
0                   Zak         12.92
1              Dajsha ♡         41.83
2                     5         43.21
3                🌹Rose🌹         41.80
4                 bells         41.95
..                  ...           ...
217       Tricia Nelson         24.26
218         slaybymunaa         14.74
219      Sophia Giuffré          2.90
220  Perfectly balanced         50.62
221           emmilllyy         13.65

[222 rows x 2 columns]


In [83]:
import pandas as pd

# Load your data into a pandas DataFrame
data = pd.read_csv('MilletInfluencers.csv')

# Select relevant columns
df = data[['Ave Video Views', 'Engagement Rate', 'Followers']].copy()

# Data Cleaning

# 1. Clean 'Engagement Rate' column
df['Engagement Rate'] = df['Engagement Rate'].astype(str).str.rstrip('%').astype(float) / 100.0

# 2. Clean 'Ave Video Views' and 'Followers' columns

# Convert columns to strings
df['Ave Video Views'] = df['Ave Video Views'].astype(str)
df['Followers'] = df['Followers'].astype(str)

# Remove commas and convert to numeric
df['Ave Video Views'] = df['Ave Video Views'].str.replace(',', '', regex=False)
df['Followers'] = df['Followers'].str.replace(',', '', regex=False)

# Convert to float
df['Ave Video Views'] = df['Ave Video Views'].astype(float)
df['Followers'] = df['Followers'].astype(float)

# Handle any NaN values (if any entries couldn't be converted)
df.dropna(subset=['Engagement Rate', 'Ave Video Views', 'Followers'], inplace=True)

# Normalize the metrics using Min-Max Scaling
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

df['Normalized Engagement Rate'] = normalize(df['Engagement Rate'])
df['Normalized Ave Video Views'] = normalize(df['Ave Video Views'])
df['Normalized Followers'] = normalize(df['Followers'])

# Assign weights
w1 = 0.5  # Engagement Rate
w2 = 0.3  # Average Video Views
w3 = 0.2  # Followers

# Calculate Weighted Sum
df['Weighted Sum'] = (
    (w1 * df['Normalized Engagement Rate']) +
    (w2 * df['Normalized Ave Video Views']) +
    (w3 * df['Normalized Followers'])
)

# Adjust the Credit Score to have a minimum of 70
df['Credit Score'] = df['Weighted Sum'] * 30 + 70  # Scale to 70-100

# Round Credit Score to 2 decimal places
df['Credit Score'] = df['Credit Score'].round(2)

# Merge Credit Score back into original data
data['Credit Score'] = df['Credit Score']

# Sort data by 'Credit Score' in ascending order
data_sorted = data.sort_values(by='Credit Score', ascending=True)

# Select the desired columns
output = data_sorted[['unique id', 'TikTok Username', 'Credit Score']]

# Reset index for neatness (optional)
output = output.reset_index(drop=True)

# Save the output DataFrame to a CSV file
output.to_csv('Curved credit scores.csv', index=False)

# Print confirmation
print("Output saved to 'Curved credit scores.csv'")


Output saved to 'Curved credit scores.csv'
