**Multi-Class Text Classification**

In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph.
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Load the grievance records. The file is decoded as Latin-1 instead of
# pandas' default UTF-8.
df = pd.read_csv("Grievance_New.csv", encoding='latin-1')

# Show the distinct raw offense tags as a single-column 2-D array.
# A bare expression is only rendered when it is the last statement of a
# notebook cell, so the value is handed to display() explicitly; otherwise
# it is computed and silently discarded.
display(pd.DataFrame(df.offense_tag.unique()).values)

# Human-readable label for each offense code that can appear in the
# `offense_tag` column. Codes not listed here are left unchanged by the
# replacement below.
#
# Two labels are shared by a pair of codes, so those codes are merged into a
# single class: '12.1.9' and '13.6' both become 'trespassing', and '12.1.3'
# and '13.7' both become 'property misuse'.
# NOTE(review): there is no entry for '12.1.2' -- confirm that is intentional.
OFFENSE_TAG_LABELS = {
    '12.1.1': 'attendance, punctuality, cutting classes',
    '12.1.3': 'property misuse',
    '12.1.4': 'noise disturbance',
    '12.1.5': 'posting violation',
    '12.1.6': 'notice removal',
    '12.1.7': 'littering',
    '12.1.8': 'smoking outside violation',
    '12.1.9': 'trespassing',
    '12.1.10': 'misconduct',
    '12.1.11': 'harassment',
    '12.1.12': 'provocation, fight',
    '12.1.13': 'PDA',
    '12.1.14': 'truancy',
    '13.1': 'repeat offenses',
    '13.2': 'insubordination',
    '13.3': 'smoking inside violation',
    '13.4': 'alcohol violation',
    '13.5': 'intoxication',
    '13.6': 'trespassing',
    '13.7': 'property misuse',
    '13.8': 'endangerment',
    '13.9': 'gambling',
    '13.10': 'identity fraud',
    '13.11': 'trademark misuse',
    '13.12': 'false representation',
    '13.13': 'abusive behavior',
    '13.14': 'unauthorized membership',
    '13.15': 'online misconduct',
    '13.16': 'vandalism',
    '13.17': 'academic disruption',
    '13.18': 'solicitation',
    '13.19': 'slight physical injury',
    '13.20': 'bladed weapons possession',
    '13.21': 'theft',
}

# Rewrite the codes in place; the nested dict restricts the replacement to
# the `offense_tag` column only.
df.replace({'offense_tag': OFFENSE_TAG_LABELS}, inplace=True)

# Encode each offense label as an integer id (ids follow order of first
# appearance in the column) and keep one (label, id) row per distinct label.
df['category_id'] = pd.factorize(df['offense_tag'])[0]
category_id_df = df[['offense_tag', 'category_id']].drop_duplicates()

# Lookup tables in both directions: label -> id and id -> label.
category_to_id = dict(
    zip(category_id_df['offense_tag'], category_id_df['category_id'])
)
id_to_category = dict(
    zip(category_id_df['category_id'], category_id_df['offense_tag'])
)

# TF-IDF over unigrams and bigrams, with English stop words removed, terms
# that occur in fewer than 5 documents dropped, and sublinear (logarithmic)
# term-frequency scaling.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)

# Vectorize every grievance text. The vectorizer is fit on the whole column,
# and the sparse result is expanded to a dense array.
features = tfidf.fit_transform(df['grievance']).toarray()
# Integer class ids, row-aligned with `features`.
labels = df['category_id']

# For every offense label, report the N unigrams and bigrams whose TF-IDF
# values are most strongly associated with it, using a chi-squared test of
# each feature against a one-vs-rest indicator for that label.
N = 3
# The vocabulary is the same for every category, so build it once instead of
# on each loop iteration.
vocabulary = np.array(tfidf.get_feature_names_out())
for offense_tag, category_id in sorted(category_to_id.items()):
    # chi2 returns (statistics, p-values); only the statistics are used.
    features_chi2 = chi2(features, labels == category_id)
    # Feature indices ordered by ascending chi-squared statistic.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    # Both lists are in ascending order, so the strongest N terms are the
    # last N entries. Previously these lists were computed and discarded and
    # N was never used, so the loop produced no result.
    print(f"\n==> {offense_tag}:")
    print(f"  * Most correlated unigrams: {', '.join(unigrams[-N:])}")
    print(f"  * Most correlated bigrams: {', '.join(bigrams[-N:])}")

# X: the raw grievance texts (the documents).
# y: the offense label of each grievance (the prediction target).
X, y = df['grievance'], df['offense_tag']

# Hold out 25% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): in the visible code these four split variables are rebound by
# a later train_test_split call before anything reads them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

# Candidate classifiers to compare on the TF-IDF feature matrix.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# Score every model with 5-fold cross-validation, collecting one
# (model_name, fold_idx, accuracy) row per fold.
# NOTE(review): `features` comes from a vectorizer that was fit on the whole
# corpus, so each fold's held-out documents already contributed to the
# vocabulary and IDF weights. Wrapping the vectorizer and model in a
# Pipeline would fit it per fold.
CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

# The earlier placeholder `cv_df = pd.DataFrame(index=...)` was dead code: it
# was overwritten here without ever being read, so it has been removed.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Split the TF-IDF matrix, the integer class ids and the DataFrame row index
# together, so the index arrays identify which rows of `df` landed in each
# part. 25% of the rows are held out, seeded with 1.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, df.index,
    test_size=0.25, random_state=1,
)

# Train a linear SVM on the training part and predict the held-out rows.
model = LinearSVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Split the raw texts and string labels again (25% held out, seed 0), this
# time so the vectorizer can be fit on the training texts only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

# Fresh vectorizer with the same settings as before; this rebinds `tfidf`.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)

# Learn the vocabulary and IDF weights from the training texts and vectorize
# them in one step. fit() returns the vectorizer itself, so
# `fitted_vectorizer` is the same object as `tfidf`.
tfidf_vectorizer_vectors = tfidf.fit_transform(X_train)
fitted_vectorizer = tfidf

# Final classifier: maps TF-IDF vectors to offense label strings.
model = LinearSVC()
model.fit(tfidf_vectorizer_vectors, y_train)

# Classify one hand-written sample complaint with the trained model.
complaint = "I'm a student filing about a classmate who keeps pestering me to share my class notes after I've asked him to stop. I find his repeated pestering to be unjust vexation and request disciplinary action."
# transform() takes an iterable of documents, so the single text is wrapped
# in a list; predict() returns an array holding one label.
complaint_vector = fitted_vectorizer.transform([complaint])
print(model.predict(complaint_vector))



['harassment']


