In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.lines as mlines

import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss
from sklearn.metrics import confusion_matrix

from scipy.sparse import hstack

In [2]:
questions = pd.read_csv("Dataset/Questions.csv",encoding='latin-1')
questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [3]:
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 607282 entries, 0 to 607281
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Id            607282 non-null  int64  
 1   OwnerUserId   601070 non-null  float64
 2   CreationDate  607282 non-null  object 
 3   Score         607282 non-null  int64  
 4   Title         607282 non-null  object 
 5   Body          607282 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 27.8+ MB


In [4]:
tags = pd.read_csv("Dataset/Tags.csv", dtype={'Tag':str})
tags.head()

Unnamed: 0,Id,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


In [5]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1885078 entries, 0 to 1885077
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Tag     object
dtypes: int64(1), object(1)
memory usage: 28.8+ MB


In [6]:
tags['Tag'] = tags['Tag'].astype(str)

In [7]:
grouped_tags = tags.groupby("Id")['Tag'].apply(lambda tags: ' '.join(tags))

In [8]:
grouped_tags.head()

Id
469                           python osx fonts photoshop
502                             python windows image pdf
535    python continuous-integration extreme-programming
594                 python sql database oracle cx-oracle
683                              python arrays iteration
Name: Tag, dtype: object

In [9]:
grouped_tags.reset_index()

Unnamed: 0,Id,Tag
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration
...,...,...
607278,40143190,python bash multiline
607279,40143228,python selenium-webdriver
607280,40143267,python django django-rest-framework
607281,40143338,python


In [10]:
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags.values})

In [11]:
grouped_tags_final.head(5)

Unnamed: 0,Id,Tags
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration


In [12]:
questions.drop(columns=['OwnerUserId', 'CreationDate'], inplace=True)

In [13]:
questions = questions.merge(grouped_tags_final, on='Id')
questions.head()

Unnamed: 0,Id,Score,Title,Body,Tags
0,469,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,python osx fonts photoshop
1,502,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,python windows image pdf
2,535,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,python continuous-integration extreme-programming
3,594,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,python sql database oracle cx-oracle
4,683,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,python arrays iteration


In [14]:
filtered_questions = questions[questions['Score']>5]

In [15]:
filtered_questions.drop(columns = ['Id', 'Score'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions.drop(columns = ['Id', 'Score'], inplace=True)


In [16]:
filtered_questions.head()

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,python osx fonts photoshop
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,python windows image pdf
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,python continuous-integration extreme-programming
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,python sql database oracle cx-oracle
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,python arrays iteration


In [17]:
filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x.split())

all_tags = [item for sublist in filtered_questions['Tags'].values for item in sublist]

print(len(all_tags))

my_set = set(all_tags)
unique_tags = list(my_set)
print(len(unique_tags))

134101
6384


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x.split())


In [18]:
flat_list = [item for sublist in filtered_questions['Tags'].values for item in sublist]

keywords = nltk.FreqDist(flat_list)

keywords = nltk.FreqDist(keywords)

frequencies_words = keywords.most_common(100)
tags_features = [word[0] for word in frequencies_words]

In [19]:
tags_features

['python',
 'django',
 'numpy',
 'matplotlib',
 'pandas',
 'python-3.x',
 'python-2.7',
 'list',
 'string',
 'flask',
 'dictionary',
 'scipy',
 'regex',
 'performance',
 'google-app-engine',
 'sqlalchemy',
 'arrays',
 'pip',
 'windows',
 'algorithm',
 'unit-testing',
 'linux',
 'unicode',
 'multithreading',
 'django-models',
 'osx',
 'json',
 'datetime',
 'c++',
 'mysql',
 'virtualenv',
 'multiprocessing',
 'subprocess',
 'class',
 'java',
 'c',
 'ipython',
 'file',
 'csv',
 'logging',
 'exception',
 'opencv',
 'sorting',
 'selenium',
 'tkinter',
 'python-imaging-library',
 'javascript',
 'module',
 'celery',
 'function',
 'import',
 'parsing',
 'cython',
 'scikit-learn',
 'math',
 'xml',
 'dataframe',
 'html',
 'pycharm',
 'machine-learning',
 'beautifulsoup',
 'plot',
 'list-comprehension',
 'generator',
 'django-admin',
 'pyqt',
 'postgresql',
 'debugging',
 'nltk',
 'tuples',
 'file-io',
 'urllib2',
 'oop',
 'setuptools',
 'sockets',
 'decorator',
 'image-processing',
 'image',
 'r

In [20]:
def most_common(tags):
    tags_filtered = []
    for i in range(0, len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

In [21]:
filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: most_common(x))
filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x if len(x)>0 else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: most_common(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Tags'] = filtered_questions['Tags'].apply(lambda x: x if len(x)>0 else None)


In [22]:
filtered_questions.head()

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,"[python, osx]"
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,"[python, windows, image]"
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,[python]
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,"[python, sql, database]"
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,"[python, arrays]"


In [23]:
filtered_questions.shape

(42420, 3)

In [24]:
filtered_questions.dropna(subset=['Tags'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions.dropna(subset=['Tags'], inplace=True)


In [25]:
filtered_questions.shape

(42420, 3)

In [26]:
filtered_questions.head()

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,"[python, osx]"
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,"[python, windows, image]"
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,[python]
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,"[python, sql, database]"
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,"[python, arrays]"


In [27]:
filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: BeautifulSoup(x).get_text()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_questions['Body'] = filtered_questions['Body'].apply(lambda x: BeautifulSoup(x).get_text())


In [28]:
filtered_questions['Body'][1]

'I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF.\nOn the Mac I am spawning sips.  Is there something similarly simple I can do on Windows?\n'

In [29]:
filtered_questions.head(10)

Unnamed: 0,Title,Body,Tags
0,How can I find the full path to a font from it...,I am using the Photoshop's javascript API to f...,"[python, osx]"
1,Get a preview JPEG of a PDF on Windows?,I have a cross-platform (Python) application w...,"[python, windows, image]"
2,Continuous Integration System for a Python Cod...,I'm starting work on a hobby project with a py...,[python]
3,cx_Oracle: How do I iterate over a result set?,There are several ways to iterate over a resul...,"[python, sql, database]"
4,Using 'in' to match an attribute of Python obj...,I don't remember whether I was dreaming or not...,"[python, arrays]"
5,Class views in Django,"Django view points to a function, which can be...","[python, django, oop]"
6,Python and MySQL,I can get Python to work with Postgresql but I...,"[python, mysql, postgresql]"
7,How do I use Python's itertools.groupby()?,I haven't been able to find an understandable ...,[python]
8,Adding a Method to an Existing Object Instance,I've read that it is possible to add a method ...,"[python, oop]"
9,How do you express binary literals in Python?,How do you express an integer as a binary numb...,[python]


In [30]:
X1 = filtered_questions['Body']
X2 = filtered_questions['Title']
y = filtered_questions['Tags']

In [31]:
vectorizer_X1 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

vectorizer_X2 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

In [32]:
X1_tfidf = vectorizer_X1.fit_transform(X1)
X2_tfidf = vectorizer_X2.fit_transform(X2)

In [33]:
X_tfidf = hstack([X1_tfidf,X2_tfidf])

In [34]:
multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(y)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size = 0.2, random_state = 0)

In [36]:
def avg_jacard(y_true,y_pred):
    jacard = np.minimum(y_true,y_pred).sum(axis=1) / np.maximum(y_true,y_pred).sum(axis=1)
    
    return jacard.mean()*100

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print("Jacard score: {}".format(avg_jacard(y_test, y_pred)))
    print("Hamming loss: {}".format(hamming_loss(y_pred, y_test)*100))
    print("---") 

In [37]:
sgd = SGDClassifier()
clf = OneVsRestClassifier(sgd)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print_score(y_pred, sgd)



Clf:  SGDClassifier
Jacard score: 75.72882288228821
Hamming loss: 0.6324846770391325
---


In [38]:
for i in range(y_train.shape[1]):
    print(multilabel_binarizer.classes_[i])
    print(confusion_matrix(y_test[:,i], y_pred[:,i]))
    print("")

algorithm
[[8388    2]
 [  86    8]]

argparse
[[8455    0]
 [   7   22]]

arrays
[[8397    0]
 [  87    0]]

beautifulsoup
[[8429    1]
 [  30   24]]

c
[[8407    0]
 [  77    0]]

c++
[[8403    7]
 [  49   25]]

celery
[[8428    1]
 [   7   48]]

class
[[8419    0]
 [  65    0]]

csv
[[8418    8]
 [  22   36]]

cython
[[8437    1]
 [  14   32]]

database
[[8446    0]
 [  38    0]]

dataframe
[[8429    0]
 [  55    0]]

datetime
[[8394    8]
 [  57   25]]

debugging
[[8443    0]
 [  40    1]]

decorator
[[8426   10]
 [  18   30]]

dictionary
[[8286   41]
 [  59   98]]

django
[[7710   27]
 [ 158  589]]

django-admin
[[8422   10]
 [  28   24]]

django-models
[[8404    0]
 [  80    0]]

django-templates
[[8451    0]
 [  33    0]]

exception
[[8423    1]
 [  60    0]]

file
[[8427    0]
 [  57    0]]

file-io
[[8445    0]
 [  39    0]]

flask
[[8311    4]
 [  61  108]]

function
[[8437    0]
 [  47    0]]

generator
[[8439    9]
 [  24   12]]

google-app-engine
[[8363    5]
 [  56   60]]

In [39]:
samq = '''In a multilabel classification problem, i use MultiLabelBinarizer to transform my 20 text labels into a binary list of zeros and ones.

After prediction I get my list of 20 binary values, and I would like to output the corresponding text labels.

I am just wondering whether MultiLabelBinarizer() provides a getting back transformation or I should do it manually.


'''

In [40]:
print(samq)

In a multilabel classification problem, i use MultiLabelBinarizer to transform my 20 text labels into a binary list of zeros and ones.

After prediction I get my list of 20 binary values, and I would like to output the corresponding text labels.

I am just wondering whether MultiLabelBinarizer() provides a getting back transformation or I should do it manually.





In [41]:
samq = BeautifulSoup(samq).get_text()

In [45]:
samq

'In a multilabel classification problem, i use MultiLabelBinarizer to transform my 20 text labels into a binary list of zeros and ones.\n\nAfter prediction I get my list of 20 binary values, and I would like to output the corresponding text labels.\n\nI am just wondering whether MultiLabelBinarizer() provides a getting back transformation or I should do it manually.'

In [44]:
samq = clean_text(samq)
samq = clean_punct(samq)

NameError: name 'clean_punct' is not defined

In [46]:
samq = lemitizeWords(samq)
samq = stopWordsRemove(samq)

NameError: name 'lemitizeWords' is not defined

In [47]:
samq

'In a multilabel classification problem, i use MultiLabelBinarizer to transform my 20 text labels into a binary list of zeros and ones.\n\nAfter prediction I get my list of 20 binary values, and I would like to output the corresponding text labels.\n\nI am just wondering whether MultiLabelBinarizer() provides a getting back transformation or I should do it manually.'

In [48]:
samq = vectorizer_X1.transform([samq])

In [49]:
samq

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [None]:
samt = 'Scikit Learn Multilabel Classification, Getting back labels from MultiLabelBinarizer'
samt = clean_text(samt)
samt = clean_punct(samt)
samt = lemitizeWords(samt)
samt = stopWordsRemove(samt)
print(samt)

In [None]:
samt = vectorizer_X2.transform([samt])

In [None]:
samt

In [None]:
q = hstack([samq, samt])

In [None]:
multilabel_binarizer.inverse_transform(clf.predict(q))