# Klasifikacija teksta

U ovom primjeru demonstrirat ćemo algoritam klasifikacije muško ženskih imena.

Primjeri engleskih imena:

```
name         label
______________________
Alice           F
John            M
Martha          F
Joanna          F
Peter           M
...

```

In [1]:
#izvlačenje značajki roda iz imena
def gender_features(word):
    return {'last_letter' : word[-1]}

In [2]:
import tkinter
import nltk
import random


In [4]:
#preuzmi nltk podatke 'names'
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [5]:
gender_features('Alice')

{'last_letter': 'e'}

In [6]:
#učitajte podatke o imenima
from nltk.corpus import names

names.words('female.txt')[:10] #prvih 10 imena iz skupa podataka


['Abagael',
 'Abagail',
 'Abbe',
 'Abbey',
 'Abbi',
 'Abbie',
 'Abby',
 'Abigael',
 'Abigail',
 'Abigale']

In [7]:
#definirajmo označeni skup podataka
names_data = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
#permutirajmo imena s oznakama
random.shuffle(names_data)
names_data[:10]

[('Blancha', 'female'),
 ('Bud', 'male'),
 ('Tilda', 'female'),
 ('Britaney', 'female'),
 ('Rafa', 'female'),
 ('Andrea', 'female'),
 ('Wiatt', 'male'),
 ('Gerda', 'female'),
 ('Roseann', 'female'),
 ('Cassey', 'female')]

In [None]:
# izgradimo skup podataka na temelju značajki 

feature_set = [(gender_features(name), g) for (name,g) in names_data]

#podjela na skup za treniranje i testiranje

size= int (0.8 * len(feature_set))
train_set, test_set = feature_set[:size], feature_set[size:]

train_set[:5]

[({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'd'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'a'}, 'female')]

In [11]:
#definicija naivnog bays kao klasifikatora te poziv na treniraje
clasifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
#provjeri klasifikaciju na novim podacima
#primjer Mark
clasifier.classify(gender_features('Mark'))
#primjer Ann-Marie
clasifier.classify(gender_features('Ann-Marie'))

#primjer Kathleen
clasifier.classify(gender_features('Kathleen'))

'male'

In [15]:
#evaluacija modela
nltk.classify.accuracy(clasifier, test_set)

0.7482693517935809

In [17]:
# informativne značajke modela
clasifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     39.5 : 1.0
             last_letter = 'k'              male : female =     29.1 : 1.0
             last_letter = 'f'              male : female =     25.3 : 1.0
             last_letter = 'p'              male : female =     12.5 : 1.0
             last_letter = 'd'              male : female =     10.1 : 1.0
             last_letter = 'o'              male : female =      9.6 : 1.0
             last_letter = 'm'              male : female =      7.7 : 1.0
             last_letter = 'r'              male : female =      7.2 : 1.0
             last_letter = 'z'              male : female =      5.7 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0


In [20]:
# idemo probati dizajnirati boljeizvlačenje značajki

def gender_features2(name):
    return { 'sufix1' : name[-1:],
            'sufix2'  : name [-2:]
        
    }

In [19]:
#test
gender_features2('Kathleen')

{'sufix1': 'n', 'sufix2': 'een'}

In [35]:
# izgradimo skup podataka na temelju značajki 

feature_set2 = [(gender_features2(name), g) for (name,g) in names_data]

#podjela na skup za treniranje i testiranje

size= int (0.8 * len(feature_set2))
train_set, test_set = feature_set2[:size], feature_set2[size:]

train_set[:5]

[({'sufix1': 'a', 'sufix2': 'ha'}, 'female'),
 ({'sufix1': 'd', 'sufix2': 'ud'}, 'male'),
 ({'sufix1': 'a', 'sufix2': 'da'}, 'female'),
 ({'sufix1': 'y', 'sufix2': 'ey'}, 'female'),
 ({'sufix1': 'a', 'sufix2': 'fa'}, 'female')]

In [36]:
#definicija naivnog bays kao klasifikatora te poziv na treniraje
clasifier = nltk.NaiveBayesClassifier.train(train_set)

In [37]:
#evaluacija modela
nltk.classify.accuracy(clasifier, test_set)

0.7595972309628697

In [38]:
# informativne značajke modela
clasifier.show_most_informative_features(10)

Most Informative Features
                  sufix2 = 'na'           female : male   =     88.6 : 1.0
                  sufix2 = 'rt'             male : female =     50.1 : 1.0
                  sufix2 = 'ia'           female : male   =     46.2 : 1.0
                  sufix1 = 'a'            female : male   =     39.5 : 1.0
                  sufix1 = 'k'              male : female =     29.1 : 1.0
                  sufix2 = 'rd'             male : female =     28.1 : 1.0
                  sufix2 = 'us'             male : female =     27.6 : 1.0
                  sufix1 = 'f'              male : female =     25.3 : 1.0
                  sufix2 = 'ra'           female : male   =     24.0 : 1.0
                  sufix2 = 'io'             male : female =     23.7 : 1.0


In [40]:
#pronađimo odstupanja

##... to be continued

In [49]:
def gender_features3(name):
    return { 'sufix1' : name[-1:].lower(),
            'sufix2'  : name [-2:].lower(),
            'sufix3' : name[0].lower(),
            'sufix4'  : name [:2].lower(),
            # 'lenght' : len(name)
    }

In [46]:
# izgradimo skup podataka na temelju značajki 

feature_set3 = [(gender_features3(name), g) for (name,g) in names_data]

#podjela na skup za treniranje i testiranje

size= int (0.8 * len(feature_set3))
train_set, test_set = feature_set3[:size], feature_set3[size:]

train_set[:5]

[({'sufix1': 'a', 'sufix2': 'ha', 'sufix3': 'b', 'sufix4': 'bl', 'lenght': 7},
  'female'),
 ({'sufix1': 'd', 'sufix2': 'ud', 'sufix3': 'b', 'sufix4': 'bu', 'lenght': 3},
  'male'),
 ({'sufix1': 'a', 'sufix2': 'da', 'sufix3': 't', 'sufix4': 'ti', 'lenght': 5},
  'female'),
 ({'sufix1': 'y', 'sufix2': 'ey', 'sufix3': 'b', 'sufix4': 'br', 'lenght': 8},
  'female'),
 ({'sufix1': 'a', 'sufix2': 'fa', 'sufix3': 'r', 'sufix4': 'ra', 'lenght': 4},
  'female')]

In [52]:
gender_features3('Kathleen')

{'sufix1': 'n', 'sufix2': 'en', 'sufix3': 'k', 'sufix4': 'ka'}

In [47]:
#definicija naivnog bays kao klasifikatora te poziv na treniraje
clasifier = nltk.NaiveBayesClassifier.train(train_set)

In [48]:
#evaluacija modela
nltk.classify.accuracy(clasifier, test_set)

0.788546255506608

In [51]:
clasifier.classify(gender_features3('Katarina'))

'female'