In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Define the ranges and options for each feature
profile_features = {
    'Education': ['BS', 'MS', 'PhD'],  # Education levels
    'Experience (Years)': [0, 10],  # Integer years
    'Experience (Months)': [0, 11],  # Remaining months in a year
    'Connections': [1, 500],  # Number of connections
    'Recent Post (Months Ago)': [1, 12],  # Time since last post
    'Reactions on Recent Post': [0, 1000],  # Number of reactions
    'Comments on Recent Post': [10, 300],  # Number of comments
    'Repost Count on Recent Post': [0, 20]  # Number of reposts
}

# Function to generate synthetic LinkedIn profiles
def generate_synthetic_profiles(num_samples):
    data = {
        'Education': np.random.choice(profile_features['Education'], num_samples, p=[0.5, 0.3, 0.2]),
        'Experience (Years)': np.random.randint(*profile_features['Experience (Years)'], num_samples),
        'Experience (Months)': np.random.randint(*profile_features['Experience (Months)'], num_samples),
        'Connections': np.random.randint(*profile_features['Connections'], num_samples),
        'Recent Post (Months Ago)': np.random.randint(*profile_features['Recent Post (Months Ago)'], num_samples),
        'Reactions on Recent Post': np.random.randint(*profile_features['Reactions on Recent Post'], num_samples),
        'Comments on Recent Post': np.random.randint(*profile_features['Comments on Recent Post'], num_samples),
        'Repost Count on Recent Post': np.random.randint(*profile_features['Repost Count on Recent Post'], num_samples)
    }
    return pd.DataFrame(data)

# Function to classify profiles into Class A, B, or C
# Function to classify profiles into Class A, B, or C
def classify_profiles(df):
    categories = []
    scores = []  # To store scores for each profile
    for _, row in df.iterrows():
        score = 0
        
        # Scoring criteria for Education
        if row['Education'] == 'PhD':
            score += 10
        elif row['Education'] == 'MS':
            score += 6
        elif row['Education'] == 'BS':
            score += 3

        # Scoring criteria for Experience
        if row['Experience (Years)'] >= 10:
            score += 10
        elif row['Experience (Years)'] >= 5:
            score += 7
        elif row['Experience (Years)'] >= 2:
            score += 3
        else:
            score += 0

        # Scoring criteria for Connections
        if row['Connections'] >= 400:
            score += 10
        elif row['Connections'] >= 200:
            score += 7
        elif row['Connections'] >= 50:
            score += 1
        else:
            score += 0

        # Scoring criteria for Recent Post (Months Ago)
        if row['Recent Post (Months Ago)'] <= 1:
            score += 10
        elif row['Recent Post (Months Ago)'] <= 3:
            score += 7
        elif row['Recent Post (Months Ago)'] <= 6:
            score += 1
        else:
            score += 0

        # Scoring criteria for Reactions on Recent Post
        if row['Reactions on Recent Post'] >= 500:
            score += 10
        elif row['Reactions on Recent Post'] >= 150:
            score += 8
        elif row['Reactions on Recent Post'] >= 20:
            score += 1
        else:
            score += 0

        # Scoring criteria for Comments on Recent Post
        if row['Comments on Recent Post'] >= 200:
            score += 10
        elif row['Comments on Recent Post'] >= 80:
            score += 8
        elif row['Comments on Recent Post'] >= 20:
            score += 3
        else:
            score += 0

        # Assign categories based on total score
        if score >= 45:  # Class A threshold
            categories.append('Class A')
        elif score >= 30:  # Class B threshold
            categories.append('Class B')
        elif score >= 15:  # Remaining are Class C
            categories.append('Class C')
        else:
            categories.append('Class C')  # Default to Class C for very low scores

        # Append score to scores list
        scores.append(score)

    # Add categories and scores to the DataFrame
    df['Category'] = categories
    df['Score'] = scores
    return df

# Generate synthetic LinkedIn profile data
synthetic_profiles = generate_synthetic_profiles(500)

# Classify the profiles
classified_profiles = classify_profiles(synthetic_profiles)

# Display the first few rows
print(classified_profiles.head())

# Save the dataset to a CSV file (optional)
classified_profiles.to_csv('linkedin_profiles_with_categories.csv', index=False)

output_file = "/mnt/data/synthetic_profiles_classified.csv"
classified_profiles.to_csv(output_file, index=False)

output_file


  Education  Experience (Years)  Experience (Months)  Connections  \
0        MS                   5                    9           13   
1        BS                   6                    1          109   
2        MS                   0                    5          358   
3       PhD                   5                    1            8   
4       PhD                   7                    9          326   

   Recent Post (Months Ago)  Reactions on Recent Post  \
0                         8                       492   
1                         3                       515   
2                         4                       932   
3                         3                       915   
4                         6                       941   

   Comments on Recent Post  Repost Count on Recent Post Category  Score  
0                      259                           10  Class B     31  
1                       61                           16  Class B     31  
2                   

OSError: Cannot save file into a non-existent directory: '\mnt\data'

In [104]:
data = pd.read_csv('linkedin_profiles_with_categories.csv')
data[data['Category']== 'Class A'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 0 to 484
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Education                    30 non-null     object
 1   Experience (Years)           30 non-null     int64 
 2   Experience (Months)          30 non-null     int64 
 3   Connections                  30 non-null     int64 
 4   Recent Post (Months Ago)     30 non-null     int64 
 5   Reactions on Recent Post     30 non-null     int64 
 6   Comments on Recent Post      30 non-null     int64 
 7   Repost Count on Recent Post  30 non-null     int64 
 8   Category                     30 non-null     object
 9   Score                        30 non-null     int64 
dtypes: int64(8), object(2)
memory usage: 2.6+ KB


In [105]:
data[data['Category']== 'Class B'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 305 entries, 1 to 498
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Education                    305 non-null    object
 1   Experience (Years)           305 non-null    int64 
 2   Experience (Months)          305 non-null    int64 
 3   Connections                  305 non-null    int64 
 4   Recent Post (Months Ago)     305 non-null    int64 
 5   Reactions on Recent Post     305 non-null    int64 
 6   Comments on Recent Post      305 non-null    int64 
 7   Repost Count on Recent Post  305 non-null    int64 
 8   Category                     305 non-null    object
 9   Score                        305 non-null    int64 
dtypes: int64(8), object(2)
memory usage: 26.2+ KB


In [106]:
data = pd.read_csv('linkedin_profiles_with_categories.csv')
data[data['Category']== 'Class C'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 165 entries, 7 to 499
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Education                    165 non-null    object
 1   Experience (Years)           165 non-null    int64 
 2   Experience (Months)          165 non-null    int64 
 3   Connections                  165 non-null    int64 
 4   Recent Post (Months Ago)     165 non-null    int64 
 5   Reactions on Recent Post     165 non-null    int64 
 6   Comments on Recent Post      165 non-null    int64 
 7   Repost Count on Recent Post  165 non-null    int64 
 8   Category                     165 non-null    object
 9   Score                        165 non-null    int64 
dtypes: int64(8), object(2)
memory usage: 14.2+ KB


In [107]:
data.nunique()

Education                        3
Experience (Years)              10
Experience (Months)             11
Connections                    323
Recent Post (Months Ago)        11
Reactions on Recent Post       386
Comments on Recent Post        222
Repost Count on Recent Post     20
Category                         3
Score                           42
dtype: int64