In [1]:
import pandas as pd
import numpy as np
import seaborn as sn

In [2]:
df = pd.read_csv('Uncleaned_DS_jobs.csv')

In [3]:
df.head(5)

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


In [4]:
df.isnull().sum()

index                0
Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
dtype: int64

In [5]:
df['Salary Estimate'].describe()

count                             672
unique                             30
top       $79K-$131K (Glassdoor est.)
freq                               32
Name: Salary Estimate, dtype: object

### Clean that columns by removing unusable parts

In [6]:
df["Salary Estimate"] = (
    df["Salary Estimate"]
    .str.replace(r"\s*\(.*?\)", "", regex=True)
)

### Split into Min and Max columns for Salary Estimate & Add Avg column

In [7]:
salary = df["Salary Estimate"].str.replace(r"[^\d\-]", "", regex=True)

df[["min_salary", "max_salary"]] = salary.str.split("-", expand=True).astype(int)

In [8]:
df["min_salary"] *= 1000
df["max_salary"] *= 1000

In [9]:
df["avg_salary"] = (df["min_salary"] + df["max_salary"]) / 2

In [10]:
df[df['min_salary'] < 0]

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary


### Remove unecessary rating in Company Name

In [11]:
df['Company Name'] = df['Company Name'].apply(lambda x: x.split('\n')[0])

In [12]:
df['Location'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 672 entries, 0 to 671
Series name: Location
Non-Null Count  Dtype 
--------------  ----- 
672 non-null    object
dtypes: object(1)
memory usage: 5.4+ KB


In [13]:
# Check for '-1' string or -1 integer in every column
for col in df.columns:
    # Count -1 (number) or '-1' (string)
    count = df[col].apply(lambda x: str(x) == '-1').sum()
    if count > 0:
        print(f"{col}: {count} hidden missing values")

Headquarters: 31 hidden missing values
Size: 27 hidden missing values
Founded: 118 hidden missing values
Type of ownership: 27 hidden missing values
Industry: 71 hidden missing values
Sector: 71 hidden missing values
Revenue: 27 hidden missing values
Competitors: 501 hidden missing values


In [14]:
df["job_state"] = df["Location"].apply(
    lambda x: x.split(",")[-1].strip() if "," in x else None
)

In [15]:
# See what locations failed
print(df[df['job_state'].isnull()]['Location'].unique())

['Remote' 'United States' 'Utah' 'New Jersey' 'Texas' 'California']


In [16]:
def get_state(location):
    if ',' in location:
        return location.split(',')[-1].strip()
    elif 'United States' in location:
        return 'US'
    elif 'Remote' in location:
        return 'Remote'
    # Handle full state names if they exist (simple mapping)
    elif 'Utah' in location:
        return 'UT'
    elif 'New Jersey' in location:
        return 'NJ'
    elif 'Texas' in location:
        return 'TX'
    elif 'California' in location:
        return 'CA'
    else:
        return location # Keep original if we can't fix it

df['job_state'] = df['Location'].apply(get_state)

In [17]:
print(df[df['job_state'].isnull()]['Location'].unique())

[]


In [18]:
# List of columns to clean
columns_to_clean = ['Size', 'Revenue', 'Industry', 'Sector', 'Type of ownership']

# Replace '-1' and 'Unknown / Non-Applicable' with 'Unknown'
for col in columns_to_clean:
    df[col] = df[col].apply(lambda x: 'Unknown' if '-1' in str(x) or 'Unknown' in str(x) else x)

# Check the result
df['Size'].value_counts()

Size
51 to 200 employees        135
1001 to 5000 employees     104
1 to 50 employees           86
201 to 500 employees        85
10000+ employees            80
501 to 1000 employees       77
5001 to 10000 employees     61
Unknown                     44
Name: count, dtype: int64

In [19]:
# Clean Rating
df['Rating'] = df['Rating'].apply(lambda x: np.nan if x < 0 else x)

# Clean Founded
df['Founded'] = df['Founded'].apply(lambda x: np.nan if x < 0 else x)

In [20]:
import datetime
current_year = datetime.datetime.now().year

# Calculate age. If 'Founded' is NaN, 'age' will be NaN automatically.
df['company_age'] = current_year - df['Founded']

# Check strictly positive ages
df['company_age'].describe()

count    554.000000
mean      41.895307
std       40.653609
min        7.000000
25%       16.000000
50%       27.000000
75%       52.000000
max      245.000000
Name: company_age, dtype: float64

In [21]:
# If the value is '-1', they have 0 competitors listed.
# Otherwise, count the commas and add 1 (e.g., "A, B" = 1 comma + 1 = 2 items)
df['num_competitors'] = df['Competitors'].apply(lambda x: 0 if str(x) == '-1' else len(x.split(',')))

print(df['num_competitors'].value_counts())

num_competitors
0    501
3    135
2     30
1      6
Name: count, dtype: int64


In [22]:
df.head()

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary,job_state,company_age,num_competitors
0,0,Sr Data Scientist,$137K-$171K,Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst,"New York, NY","New York, NY",1001 to 5000 employees,1993.0,...,Insurance Carriers,Insurance,Unknown,"EmblemHealth, UnitedHealth Group, Aetna",137000,171000,154000.0,NY,33.0,3
1,1,Data Scientist,$137K-$171K,"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968.0,...,Research & Development,Business Services,$1 to $2 billion (USD),-1,137000,171000,154000.0,VA,58.0,0
2,2,Data Scientist,$137K-$171K,Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000 employees,1981.0,...,Consulting,Business Services,$100 to $500 million (USD),-1,137000,171000,154000.0,MA,45.0,0
3,3,Data Scientist,$137K-$171K,JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000.0,...,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000,171000,154000.0,MA,26.0,3
4,4,Data Scientist,$137K-$171K,Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200 employees,1998.0,...,Advertising & Marketing,Business Services,Unknown,"Commerce Signals, Cardlytics, Yodlee",137000,171000,154000.0,NY,28.0,3


In [23]:
import re
from collections import Counter

# 1. Cleaning function to tokenize and clean text
def clean_text(text):
    text = str(text).lower()
    # FIX 1: Replace special chars with SPACE ' ', not empty string ''
    # This prevents "Python/R" from becoming "pythonr"
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Split into words
    words = text.split()
    return words

# 2. Collect all words from all job descriptions
all_words = []
for description in df['Job Description']:
    all_words.extend(clean_text(description))

# 3. Define stop words (Your original list)
stop_words = set([
    'and', 'to', 'the', 'of', 'a', 'in', 'for', 'with', 'is', 'as', 'on', 'are', 'that', 'be', 'or', 'an', 
    'will', 'our', 'we', 'this', 'data', 'experience', 'work', 'team', 'business', 'skills', 'years', 
    'development', 'science', 'analysis', 'solutions', 'support', 'technical', 'including', 'knowledge', 
    'product', 'management', 'learning', 'machine', 'models', 'analytics', 'company', 'research', 'new', 
    'engineering', 'project', 'strong', 'environment', 'systems', 'role', 'us', 'opportunity', 'working', 
    'related', 'using', 'degree', 'design', 'other', 'all', 'can', 'have', 'from', 'at', 'by', 'your', 
    'complex', 'across', 'ability', 'information', 'technologies', 'tools', 'best', 'practices', 'algorithms', 
    'statistical', 'large', 'projects', 'help', 'understanding', 'required', 'services', 'provide', 'build', 
    'status', 'employment', 'equal', 'gender', 'race', 'color', 'national', 'origin', 'sexual', 'orientation', 
    'identity', 'disability', 'veteran', 'protected', 'law', 'job', 'position', 'requirements', 'qualifications'
])

# 4. Filter out stop words
# FIX 2: Modified condition to allow 'r' even if length is 1
filtered_words = [word for word in all_words if word not in stop_words and (len(word) > 1 or word == 'r')]

# 5. Count frequencies
word_counts = Counter(filtered_words)

# 6. Display top 30 most common words
print("--- Top 30 Common Words ---")
print(word_counts.most_common(30))

# 7. Check specifically for common data science skills
common_skills = ['python', 'r', 'sql', 'java', 'scala', 'aws', 'spark', 'excel', 'tableau', 'hadoop', 'azure', 'tensorflow', 'keras', 'pytorch', 'scikit', 'pandas', 'numpy']
print("\n--- Specific Skill Counts ---")
for skill in common_skills:
    print(f"{skill}: {word_counts[skill]}")

--- Top 30 Common Words ---
[('you', 2268), ('scientist', 741), ('python', 646), ('software', 639), ('such', 631), ('develop', 618), ('computer', 614), ('more', 599), ('about', 587), ('techniques', 540), ('it', 517), ('technology', 506), ('their', 502), ('insights', 494), ('advanced', 476), ('modeling', 473), ('time', 473), ('statistics', 468), ('use', 467), ('problems', 462), ('not', 454), ('sql', 453), ('r', 451), ('based', 447), ('world', 443), ('building', 440), ('into', 438), ('must', 419), ('teams', 415), ('analytical', 411)]

--- Specific Skill Counts ---
python: 646
r: 451
sql: 453
java: 167
scala: 104
aws: 179
spark: 224
excel: 95
tableau: 153
hadoop: 163
azure: 77
tensorflow: 116
keras: 43
pytorch: 60
scikit: 69
pandas: 67
numpy: 57


In [24]:
# Create columns for the Top 10 Skills
# Note: For 'R', we use your regex logic to be safe. For others, simple string matching is fine.

# 1. Python
df['python_yn'] = df['Job Description'].apply(lambda x: 1 if 'python' in x.lower() else 0)

# 2. R (Using regex to catch "R" but avoid "R&D" or "Reader")
import re
df['R_yn'] = df['Job Description'].apply(lambda x: 1 if re.search(r'\br\b', str(x).lower()) else 0)

# 3. SQL
df['sql_yn'] = df['Job Description'].apply(lambda x: 1 if 'sql' in x.lower() else 0)

# 4. Spark
df['spark_yn'] = df['Job Description'].apply(lambda x: 1 if 'spark' in x.lower() else 0)

# 5. AWS
df['aws_yn'] = df['Job Description'].apply(lambda x: 1 if 'aws' in x.lower() else 0)

# 6. Java
df['java_yn'] = df['Job Description'].apply(lambda x: 1 if 'java' in x.lower() else 0)

# 7. Hadoop
df['hadoop_yn'] = df['Job Description'].apply(lambda x: 1 if 'hadoop' in x.lower() else 0)

# 8. Tableau
df['tableau_yn'] = df['Job Description'].apply(lambda x: 1 if 'tableau' in x.lower() else 0)

# 9. TensorFlow
df['tensorflow_yn'] = df['Job Description'].apply(lambda x: 1 if 'tensorflow' in x.lower() else 0)

# 10. Scala
df['scala_yn'] = df['Job Description'].apply(lambda x: 1 if 'scala' in x.lower() else 0)

# Check the first few rows to see the new columns
df[['python_yn', 'R_yn', 'sql_yn', 'aws_yn']].head()

Unnamed: 0,python_yn,R_yn,sql_yn,aws_yn
0,0,0,0,1
1,0,0,1,0
2,1,1,0,1
3,1,0,1,1
4,1,1,1,0


In [25]:
df[['python_yn', 'R_yn', 'sql_yn', 'aws_yn']].sum()

python_yn    491
R_yn         314
sql_yn       356
aws_yn       174
dtype: int64