In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# Read the CSV file into a DataFrame
data = pd.read_csv('Full Team.csv')

# Extract relevant columns
team_data = data[['Name', 'Linkedin Info']]

# Define the role descriptions
role_descriptions = {
    'Manager': ['manager', 'lead', 'director'],
    'Developer': ['developer', 'programmer', 'software engineer'],
    'Marketing': ['marketing', 'digital marketing', 'brand'],
    'Designer': ['designer', 'graphic designer']
}

In [46]:
# Create the TF-IDF vectorizer and fit it on the role descriptions
tfidf = TfidfVectorizer(vocabulary=[keyword for keywords in role_descriptions.values() for keyword in keywords])
tfidf.fit([' '.join(keywords) for keywords in role_descriptions.values()])

# Function to assign the role based on similarity with role descriptions
def assign_role(linkedin_info):
    if pd.isnull(linkedin_info):
        return 'Other'
    info = str(linkedin_info).lower()
    info_vector = tfidf.transform([info]).toarray()
    role_vectors = tfidf.transform([' '.join(keywords) for keywords in role_descriptions.values()]).toarray()
    similarity_scores = np.dot(info_vector, role_vectors.T).ravel()
    max_sim_index = np.argmax(similarity_scores)
    return list(role_descriptions.keys())[max_sim_index]

# Apply the function to the 'Linkedin Info' column
team_data['Role'] = team_data['Linkedin Info'].apply(assign_role)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data['Role'] = team_data['Linkedin Info'].apply(assign_role)


In [52]:
# Count the number of occurrences for each role
role_counts = team_data['Role'].value_counts()
role_counts

Manager      97
Marketing     8
Developer     4
Designer      2
Name: Role, dtype: int64

In [47]:
# Apply the function to the 'Linkedin Info' column
team_data['Role'] = team_data['Linkedin Info'].apply(assign_role)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_data['Role'] = team_data['Linkedin Info'].apply(assign_role)


In [53]:
# Create a bar plot using Plotly
fig = go.Figure(data=[go.Bar(x=role_counts.index, y=role_counts.values)])
fig.update_layout(
    title="Distribution of Roles in FamPay Team",
    xaxis_title="Role",
    yaxis_title="Count",
    xaxis=dict(tickangle=45),
)
fig.show()