In [9]:
import numpy as np
import pandas as pd

# Value space for every synthetic profile attribute.
# 'Education' lists the allowed labels; each remaining entry is a
# [low, high] pair of integer bounds that generate_synthetic_profiles
# unpacks into its random-integer call.
profile_features = {
    # Education levels
    'Education': ['BS', 'MS', 'PhD'],
    # Whole years of experience
    'Experience (Years)': [0, 10],
    # Months of experience beyond the whole years
    'Experience (Months)': [0, 11],
    # Number of connections
    'Connections': [1, 500],
    # Months elapsed since the most recent post
    'Recent Post (Months Ago)': [1, 12],
    # Number of reactions on the most recent post
    'Reactions on Recent Post': [0, 1000],
    # Number of comments on the most recent post
    'Comments on Recent Post': [10, 300],
    # Number of reposts of the most recent post
    'Repost Count on Recent Post': [0, 20],
}

# Function to generate synthetic LinkedIn profiles
def generate_synthetic_profiles(num_samples):
    data = {
        'Education': np.random.choice(profile_features['Education'], num_samples, p=[0.5, 0.3, 0.2]),
        'Experience (Years)': np.random.randint(*profile_features['Experience (Years)'], num_samples),
        'Experience (Months)': np.random.randint(*profile_features['Experience (Months)'], num_samples),
        'Connections': np.random.randint(*profile_features['Connections'], num_samples),
        'Recent Post (Months Ago)': np.random.randint(*profile_features['Recent Post (Months Ago)'], num_samples),
        'Reactions on Recent Post': np.random.randint(*profile_features['Reactions on Recent Post'], num_samples),
        'Comments on Recent Post': np.random.randint(*profile_features['Comments on Recent Post'], num_samples),
        'Repost Count on Recent Post': np.random.randint(*profile_features['Repost Count on Recent Post'], num_samples)
    }
    return pd.DataFrame(data)

# Function to classify profiles into Class A, B, or C
def classify_profiles(df):
    """Score every profile and label it 'Class A', 'Class B' or 'Class C'.

    The score is the sum of the points earned for Education, Experience
    (Years), Connections, Recent Post (Months Ago), Reactions on Recent Post
    and Comments on Recent Post. 'Experience (Months)' and 'Repost Count on
    Recent Post' are not read. Unrecognised education labels and values
    below the lowest tier earn 0 points.

    Parameters
    ----------
    df : pandas.DataFrame
        Profiles containing the six scored columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with two new columns:
        'Category' (score >= 45 -> 'Class A', score >= 30 -> 'Class B',
        otherwise 'Class C') and 'Score' (the integer total).
    """
    education_points = {'PhD': 10, 'MS': 6, 'BS': 3}

    # (threshold, points) tiers, listed so the first matching tier wins.
    # "At least" columns reward larger values.
    at_least_tiers = {
        'Experience (Years)': [(10, 10), (5, 7), (2, 3)],
        'Connections': [(400, 10), (200, 7), (50, 1)],
        'Reactions on Recent Post': [(500, 10), (150, 8), (20, 1)],
        'Comments on Recent Post': [(200, 10), (80, 8), (20, 3)],
    }
    # "At most" columns reward smaller values (a more recent post).
    at_most_tiers = {
        'Recent Post (Months Ago)': [(1, 10), (3, 7), (6, 1)],
    }

    def tier_points(conditions, tiers):
        # np.select takes the first true condition per row, like an if/elif chain.
        return np.select(conditions, [points for _, points in tiers], default=0)

    # Unknown labels map to NaN, which becomes 0 points.
    score = df['Education'].map(education_points).fillna(0).to_numpy(dtype=np.int64)

    for column, tiers in at_least_tiers.items():
        values = df[column]
        score = score + tier_points(
            [(values >= bound).to_numpy() for bound, _ in tiers], tiers
        )

    for column, tiers in at_most_tiers.items():
        values = df[column]
        score = score + tier_points(
            [(values <= bound).to_numpy() for bound, _ in tiers], tiers
        )

    # Add categories and scores to the DataFrame. Plain arrays are assigned
    # positionally, matching the original list-based assignment.
    df['Category'] = np.select(
        [score >= 45, score >= 30], ['Class A', 'Class B'], default='Class C'
    )
    df['Score'] = score
    return df

# Fix the global NumPy seed so that re-running the notebook regenerates
# the same synthetic dataset.
np.random.seed(42)

# Generate synthetic LinkedIn profile data
synthetic_profiles = generate_synthetic_profiles(500)

# Classify the profiles (adds the 'Category' and 'Score' columns)
classified_profiles = classify_profiles(synthetic_profiles)

# Display the first few rows
print(classified_profiles.head())

# Save the dataset to a CSV file. The analysis cells further down read
# this file back, so this step is required for them to run.
classified_profiles.to_csv('linkedin_profiles_with_categories.csv', index=False)

# Write a second copy of the same data under another file name
output_file = "synthetic_profiles_classified.csv"
classified_profiles.to_csv(output_file, index=False)

# Last expression of the cell: echo the path of the second copy
output_file


  Education  Experience (Years)  Experience (Months)  Connections  \
0        BS                   4                    3          395   
1        BS                   1                    6          377   
2        BS                   5                    6          153   
3        BS                   0                    4          208   
4       PhD                   8                    6          306   

   Recent Post (Months Ago)  Reactions on Recent Post  \
0                         9                       491   
1                         5                       372   
2                        11                       145   
3                         7                       524   
4                         2                        23   

   Comments on Recent Post  Repost Count on Recent Post Category  Score  
0                       37                            1  Class C     24  
1                      146                           14  Class C     27  
2                   

'synthetic_profiles_classified.csv'

In [4]:
# Read the saved dataset back and summarise the rows labelled 'Class A'
data = pd.read_csv('linkedin_profiles_with_categories.csv')
data.loc[data['Category'].eq('Class A')].info()

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 492
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Education                    28 non-null     object
 1   Experience (Years)           28 non-null     int64 
 2   Experience (Months)          28 non-null     int64 
 3   Connections                  28 non-null     int64 
 4   Recent Post (Months Ago)     28 non-null     int64 
 5   Reactions on Recent Post     28 non-null     int64 
 6   Comments on Recent Post      28 non-null     int64 
 7   Repost Count on Recent Post  28 non-null     int64 
 8   Category                     28 non-null     object
 9   Score                        28 non-null     int64 
dtypes: int64(8), object(2)
memory usage: 2.4+ KB


In [5]:
# Summarise the rows labelled 'Class B'
data.loc[data['Category'].eq('Class B')].info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 5 to 499
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Education                    306 non-null    object
 1   Experience (Years)           306 non-null    int64 
 2   Experience (Months)          306 non-null    int64 
 3   Connections                  306 non-null    int64 
 4   Recent Post (Months Ago)     306 non-null    int64 
 5   Reactions on Recent Post     306 non-null    int64 
 6   Comments on Recent Post      306 non-null    int64 
 7   Repost Count on Recent Post  306 non-null    int64 
 8   Category                     306 non-null    object
 9   Score                        306 non-null    int64 
dtypes: int64(8), object(2)
memory usage: 26.3+ KB


In [6]:
# Read the saved dataset again and summarise the rows labelled 'Class C'
data = pd.read_csv('linkedin_profiles_with_categories.csv')
data.loc[data['Category'].eq('Class C')].info()

<class 'pandas.core.frame.DataFrame'>
Index: 166 entries, 1 to 495
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Education                    166 non-null    object
 1   Experience (Years)           166 non-null    int64 
 2   Experience (Months)          166 non-null    int64 
 3   Connections                  166 non-null    int64 
 4   Recent Post (Months Ago)     166 non-null    int64 
 5   Reactions on Recent Post     166 non-null    int64 
 6   Comments on Recent Post      166 non-null    int64 
 7   Repost Count on Recent Post  166 non-null    int64 
 8   Category                     166 non-null    object
 9   Score                        166 non-null    int64 
dtypes: int64(8), object(2)
memory usage: 14.3+ KB


In [7]:
# Count the distinct values found in each column of the dataset read from the CSV
data.nunique()

Education                        3
Experience (Years)              10
Experience (Months)             11
Connections                    318
Recent Post (Months Ago)        11
Reactions on Recent Post       377
Comments on Recent Post        236
Repost Count on Recent Post     20
Category                         3
Score                           39
dtype: int64