# Import Libraries

In [550]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Importing Data

In [551]:
with open('new_eda_data.pkl', 'rb') as file:
    df = pickle.load(file)

# select columns for modelling
df_tags = df[['tags', 'target']].copy()

df_tags.reset_index(drop=True, inplace=True)

df_tags

Unnamed: 0,tags,target
0,[],1
1,[],0
2,[],1
3,[],0
4,[],0
...,...,...
8499,"[Data Analyst, How to become a data analyst, D...",1
8500,"[Data Analyst, How to become a Data Analyst, B...",1
8501,"[Data Analyst Salary, Data analyst with no exp...",1
8502,"[Working at a big company, Big company data an...",1


# Tags Column Mapping

In [552]:
# store unique tag count across df
unique_tag_count = {}

# iterate through unique tags and map their count
for tag_list in df_tags['tags']:

    for tag in tag_list:
        if tag.lower() in unique_tag_count:
            unique_tag_count[tag.lower()] += 1
        else:
            unique_tag_count[tag.lower()] = 1

# sort unique tag count alphabetically
unique_tag_count = dict(sorted(unique_tag_count.items()))

# find out how many unique tags we have
print(len(unique_tag_count))

31997


In [553]:
unique_tag_count = {}
pos_tag_count = {}
neg_tag_count = {}

for index, row in df_tags.iterrows():
    tag_list = row['tags']

    for tag in tag_list:
        if tag.lower() in unique_tag_count:
            unique_tag_count[tag.lower()] += 1
        else:
            unique_tag_count[tag.lower()] = 1

        if row['target'] == 1:
            if tag.lower() in pos_tag_count:
                pos_tag_count[tag.lower()] += 1
            else:
                pos_tag_count[tag.lower()] = 1

        if row['target'] == 0:
            if tag.lower() in neg_tag_count:
                neg_tag_count[tag.lower()] += 1
            else:
                neg_tag_count[tag.lower()] = 1




In [554]:
# Convert dictionaries to DataFrames
df_unique = pd.DataFrame(list(unique_tag_count.items()), columns=['Tag', 'Unique_Count'])
df_pos = pd.DataFrame(list(pos_tag_count.items()), columns=['Tag', 'Positive_Count'])
df_neg = pd.DataFrame(list(neg_tag_count.items()), columns=['Tag', 'Negative_Count'])

# Merge DataFrames on the 'Tag' column
merged_df = pd.merge(df_unique, df_pos, on='Tag', how='outer')
merged_df = pd.merge(merged_df, df_neg, on='Tag', how='outer')

# Fill NaN values with 0
merged_df = merged_df.fillna(0).astype({'Unique_Count': int, 'Positive_Count': int, 'Negative_Count': int})

# Display the merged DataFrame
merged_df


Unnamed: 0,Tag,Unique_Count,Positive_Count,Negative_Count
0,power bi,100,83,17
1,data science,3084,1461,1623
2,data scientist,821,353,468
3,self-taugh data scientist,66,60,6
4,big tech,66,59,7
...,...,...,...,...
31992,data analyst at big company,1,1,0
31993,how to learn to become a data analyst,1,1,0
31994,top skills for data analyst,1,1,0
31995,top 3 skills for data analyst,1,1,0


In [None]:
# TF-IDF =  Term Frequency * log(Number of Documents/Document Frequency)

In [555]:
merged_df['TF-IDF'] = merged_df['Unique_Count'] * np.log(8504/merged_df['Unique_Count'])

In [None]:
1000  |  500  |  500 | 7905    | 7905
500   |  400  |  100 | 10,000  | 16,000

In [None]:
TF-IDF * 1 + abs(pos% - neg%)

7905 * 1 + abs(500/1000*100 - 500/1000*100)

7905 * 1 + abs(50 - 50)

7905 * 1+0

7905

In [None]:
TF-IDF * 1 + abs(pos% - neg%)

500 * 1 + abs(400/500*100 - 100/500*100)

10,000 * 1 + abs(80 - 20)

10,000 * 1+0.6

16,000

In [573]:
print(score1(100, 40, 60))
print(score1(100, 50, 50))
print(score1(100, 60, 40))

9330.555641470753
444.31217340336923
9330.555641470753


In [556]:
merged_df.sort_values(by='TF-IDF', ascending=False).tail(20)

Unnamed: 0,Tag,Unique_Count,Positive_Count,Negative_Count,TF-IDF
12585,python re module,1,1,0,9.048292
12584,re module,1,1,0,9.048292
12583,python regex module,1,1,0,9.048292
12582,python regex tutorial,1,1,0,9.048292
12581,python regular expressions,1,1,0,9.048292
12580,how to use regular expression in python,1,1,0,9.048292
12579,learn regular expression,1,1,0,9.048292
12577,python regex,1,1,0,9.048292
12576,regular expressions tutorial for beginners in ...,1,1,0,9.048292
12574,access files present in different path,1,1,0,9.048292


In [557]:
merged_df['Balance'] = merged_df['Positive_Count']/merged_df['Unique_Count'] - merged_df['Negative_Count']/merged_df['Unique_Count']

In [558]:
merged_df['Score'] = merged_df['TF-IDF'] * (1 +(np.abs(merged_df['Balance'])))

In [559]:
merged_df.sort_values(by='Score', ascending=False).head(20)

Unnamed: 0,Tag,Unique_Count,Positive_Count,Negative_Count,TF-IDF,Balance,Score
1,data science,3084,1461,1623,3128.129528,-0.052529,3292.447616
5,machine learning,2251,1216,1035,2991.94384,0.080409,3232.522176
7466,datacamp,780,110,670,1863.41844,-0.717949,3201.25732
5127,great learning,1556,641,915,2642.738743,-0.176093,3108.105334
20804,great lakes,1067,403,664,2214.756609,-0.244611,2756.510568
2672,yt:cc=on,808,578,230,1901.813728,0.430693,2720.91172
13,data analytics,1270,599,671,2414.930071,-0.056693,2551.839492
11,big data,1032,467,565,2176.527189,-0.094961,2383.212911
6,python,1075,503,572,2223.332178,-0.064186,2366.039081
1827,artificial intelligence,715,242,473,1770.346705,-0.323077,2342.304871


In [570]:
refined_df = merged_df[np.abs(merged_df['Balance']) > 0.4]
refined_df.sort_values(by='Score', ascending=False).head(20)

Unnamed: 0,Tag,Unique_Count,Positive_Count,Negative_Count,TF-IDF,Balance,Score
7466,datacamp,780,110,670,1863.41844,-0.717949,3201.25732
2672,yt:cc=on,808,578,230,1901.813728,0.430693,2720.91172
843,how to learn data science,400,73,327,1222.730949,-0.635,1999.165102
179,data analyst career,280,235,45,955.780649,0.678571,1604.346089
7298,data analyst job,305,241,64,1015.033944,0.580328,1604.086429
1280,r tutorial,325,93,232,1060.95169,-0.427692,1514.712566
2001,ml,321,94,227,1051.869106,-0.41433,1487.690262
6642,statquest,209,195,14,774.545153,0.866029,1445.323491
7062,analytics,234,37,197,840.755168,-0.683761,1415.630497
16186,rtutorial,209,22,187,774.545153,-0.789474,1386.028168


In [585]:
top_1000_df = refined_df.sort_values(by='Score', ascending=False).head(1000).copy()
top_1000_df['pos_or_neg'] = top_1000_df['Balance'].apply(lambda x: 1 if x>0 else 0)
top_1000_df['pos_or_neg'].value_counts()

0    511
1    489
Name: pos_or_neg, dtype: int64

In [574]:
top_1000_features = list(refined_df.sort_values(by='Score', ascending=False).head(1000)['Tag'])

In [575]:
len(top_1000_features)

1000

In [576]:
# create empty columns of all unique tags
for tag in top_1000_features:
    df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)

  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in x] else 0)
  df_tags[tag] = df_tags['tags'].apply(lambda x: 1 if tag in [i.lower() for i in

In [587]:
df_tags.head()

Unnamed: 0,tags,target,datacamp,yt:cc=on,how to learn data science,data analyst career,data analyst job,r tutorial,ml,statquest,...,regex,exploring,interactive graphics in r,web scrape,credit risk modeling,c (programming language),connect 4 strategy,what is cluster analysis,rcode,refactoring in software engineering
0,[],1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[],1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,[],0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train, Test, Split

In [577]:
# features
X = df_tags.drop(columns=['tags', 'target'])
# target
y = df_tags['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

# Modelling

In [578]:
# create logistic regression object and fit it with training data
logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train, y_train)

In [579]:
# function to calculate the accuracy, precision and recall
def apr(y_pred, y_real):       
    
    """ Calculates accuracy, precision, recall
        Requires predicted value first, and then the real value
    """
    accuracy = metrics.accuracy_score(y_real, y_pred)
    precision = metrics.precision_score(y_real, y_pred)
    recall = metrics.recall_score(y_real, y_pred)
    f1 = metrics.f1_score(y_real, y_pred)

    print(f"Accuracy:{accuracy}")
    print(f"Precision:{precision}")
    print(f"Recall:{recall}")
    print(f"F1:{f1}")

In [580]:
# predict on train data
y_train_pred = logreg.predict(X_train)

# check train data metric scores
apr(y_train_pred, y_train)

Accuracy:0.7421725709245921
Precision:0.8221528861154446
Recall:0.6189078097475044
F1:0.7061976549413735


In [581]:
# predict on test data
y_test_pred = logreg.predict(X_test)

# check test data metric scores
apr(y_test_pred, y_test)

Accuracy:0.7125220458553791
Precision:0.7736434108527132
Recall:0.5926365795724465
F1:0.6711499663752521


# Score Calculations

In [358]:
def score1(uc, pc, nc):
    tfidf = uc * np.log(8504/uc)
    weight = 1 + np.abs(pc - nc)

    score = tfidf * weight

    return score

In [359]:
def score2(uc, pc, nc):
    tfidf = uc * np.log(8504/uc)
    weight = 1 + np.square(pc - nc)

    score = tfidf * weight

    return score

In [360]:
print(score1(1000, 500, 500))
print(score1(500, 400, 100))
print(score1(50, 40, 10))
print(score2(1000, 500, 500))
print(score2(500, 400, 100))
print(score2(50, 40, 10))

2140.536641039647
426469.4151507386
7961.216817620138
2140.536641039647
127517188.81389245
231388.91460244334


# Testing Max_Features

In [361]:
# create function for optimum feature selection for model
def max_features(low, high, step):

    max_list = list(range(low,(high+1),step))
    train_accuracy = []
    test_accuracy = []

    for x in max_list:
        
        classifier = LogisticRegression(max_iter=500)
        classifier.fit(X_train, y_train)

        y_pred_train = classifier.predict(X_train)
        acc = accuracy_score(y_train, y_pred_train)
        train_accuracy.append(acc)

        y_pred_test = classifier.predict(X_test)
        acc2 = accuracy_score(y_test, y_pred_test)
        test_accuracy.append(acc2)

    data = {'max_features': max_list, 'train_accuracy': train_accuracy, 'test_accuracy': test_accuracy}

    max_df = pd.DataFrame(data)

    max_df['diff'] = max_df.train_accuracy - max_df.test_accuracy

    return max_df

In [362]:
max_features(100, 1000, 100)

Unnamed: 0,max_features,train_accuracy,test_accuracy,diff
0,100,0.767015,0.717225,0.049789
1,200,0.767015,0.717225,0.049789
2,300,0.767015,0.717225,0.049789
3,400,0.767015,0.717225,0.049789
4,500,0.767015,0.717225,0.049789
5,600,0.767015,0.717225,0.049789
6,700,0.767015,0.717225,0.049789
7,800,0.767015,0.717225,0.049789
8,900,0.767015,0.717225,0.049789
9,1000,0.767015,0.717225,0.049789
