In [1]:
import pandas as pd
import seaborn as sns 
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_curve, auc
%matplotlib inline

ModuleNotFoundError: No module named 'pandas'

In [2]:
data = pd.read_csv('train_with_label.csv')

In [3]:
final_data = data[['Category', 'Title', 'FullDescription']]
final_data.groupby(by = 'Category').count()

Unnamed: 0_level_0,Title,FullDescription
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Business/Finance,70494,70494
Charity/Volunteering,2332,2332
Consulting,3263,3263
Education/Research,15126,15126
Healthcare,21075,21076
Human Resources,7713,7713
Law,3939,3939
Others,53713,53713
Policy/Government/Social Work,3455,3455
Technology,63657,63657


In [4]:
final_data = pd.concat([final_data[final_data['Category']=='Business/Finance'].sample(n=2238),
                        final_data[final_data['Category']=='Charity/Volunteering'].sample(n=2238),
                final_data[final_data['Category']== 'Consulting'].sample(n=2238), 
                final_data[final_data['Category']=='Education/Research'].sample(n=2238),
                final_data[final_data['Category']=='Healthcare'].sample(n=2238),
                final_data[final_data['Category']=='Human Resources'].sample(n=2238),
                final_data[final_data['Category']=='Law'].sample(n=2238),
                final_data[final_data['Category']=='Others'].sample(n=2238),
                final_data[final_data['Category']=='Policy/Government/Social Work'].sample(n=2238),
                final_data[final_data['Category']=='Technology'].sample(n=2238)]
                )

In [6]:
def clean_text(row):
    
    row = re.sub(r"n\'t", " not", row)
    row = re.sub(r"n\'ll", " will", row)
    row = re.sub(r"n\'ve", " have", row)
    row = re.sub(r"n\'t", " not", row)
    row = re.sub(r"i.e", " ", row)
    row = re.sub(r"n\'s", "", row)
    row = re.sub("[^a-zA-Z]"," ", row)
    white_space = re.compile(r"\s+")
    row = white_space.sub(" ", row).strip()
    return row 

In [7]:
final_data['FullDescription'] = (final_data['FullDescription']).apply(lambda row: clean_text(row))

In [8]:
final_data['FullDescription'].tail(1)

227755    NET Developer ASP NET C C NET dot NET Web Appl...
Name: FullDescription, dtype: object

In [9]:
final_data.reset_index(inplace = True)

In [10]:
final_data["concat_text"] = final_data.FullDescription.astype(str) + final_data.Title.astype(str)

In [11]:
stopwords = set(['i','l','my','it','off','means','if','you','husband','do','what','and','a','an'
             'is','for','this','after','the','so','to','m', 'that','into','those','were','was',
             'other','some','are','now','ry','at','serv','t','s','rece',
             'in','don','adv','word','let','her','him','he','she','them','they','be','been',
             've','some', 'such','qu','same','only','up','here','there','do','very','over',
             'but','via','felt','who','whom','whose','where','how','about','just','most','has',
             'had','have','way','back','front','let','flow','sun','del','your','move','got','air',
              'breath', 'dude','know','mean','pan','means','mine','both','with','another','bit',
              'clumps','needs','room','code','one','ones','f', 'em','as','n','cho','me','descr',
              'pr','compet','re','could','would','should','even','r','out','their','n','ly','down',
              'from','because','until','unless','while','its','about','all','any','few','too',
              'own','itself','ppl','keep','really','got','AP','close'])

vect = TfidfVectorizer(lowercase = True, stop_words = {'english'}.update(stopwords), max_df = .8, min_df = 3) 
X = vect.fit_transform(final_data.concat_text)
y = final_data.Category

In [12]:
print(X.shape,y.shape)

(22380, 18867) (22380,)


In [13]:
lg = LogisticRegression() 
clf = LogisticRegression(penalty='l2',C = 10)
train_idx, test_idx = train_test_split(np.arange(final_data.shape[0]), test_size=0.4, 
                                       shuffle=True, random_state=42)
X_train = X[train_idx]
Y_train = y[train_idx]
X_test = X[test_idx]
Y_test = y[test_idx]
clf.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=10)

In [14]:
y_pred = clf.predict(X_test)

In [16]:
y_pred

array(['Consulting', 'Education/Research',
       'Policy/Government/Social Work', ..., 'Education/Research',
       'Education/Research', 'Others'], dtype=object)

In [18]:
Y_test

14223                              Law
3498              Charity/Volunteering
19536    Policy/Government/Social Work
18407    Policy/Government/Social Work
10714                       Healthcare
                     ...              
1886                  Business/Finance
8537                Education/Research
6829                Education/Research
10609                       Healthcare
16694                           Others
Name: Category, Length: 8952, dtype: object

In [15]:
print(classification_report(Y_test, y_pred))

                               precision    recall  f1-score   support

             Business/Finance       0.53      0.61      0.57       866
         Charity/Volunteering       0.75      0.77      0.76       920
                   Consulting       0.50      0.48      0.49       907
           Education/Research       0.76      0.74      0.75       923
                   Healthcare       0.76      0.74      0.75       905
              Human Resources       0.77      0.75      0.76       866
                          Law       0.96      0.92      0.94       902
                       Others       0.57      0.56      0.56       908
Policy/Government/Social Work       0.73      0.71      0.72       866
                   Technology       0.70      0.74      0.72       889

                     accuracy                           0.70      8952
                    macro avg       0.70      0.70      0.70      8952
                 weighted avg       0.70      0.70      0.70      8952



In [19]:
industry_pred = ['Business/Finance', 'Education/Research']
industry = ['Business/Finance', 'Business/Finance']

In [21]:
print(classification_report(industry_pred, industry))

                    precision    recall  f1-score   support

  Business/Finance       0.50      1.00      0.67         1
Education/Research       0.00      0.00      0.00         1

          accuracy                           0.50         2
         macro avg       0.25      0.50      0.33         2
      weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
