In [114]:
import numpy as np
import pandas as pd

In [115]:
# Running totals per named-entity type, keyed by the type code that follows
# the B-/I- prefix in the 'tag' column. Every counter starts at zero.
entity_type_dict = dict.fromkeys(
    ('geo', 'tim', 'org', 'per', 'art', 'nat', 'gpe', 'eve'), 0
)
# Filled later: maps a sentence index to that sentence's list of token tuples.
sentence_dict = dict()

### 1. Read the dataset from the CSV file. Malformed lines are skipped instead of raising, because otherwise one line (#281837) causes a ParserError: it has too many fields (34 instead of the expected 25).

In [116]:
# Load the token-level NER table. `error_bad_lines` was deprecated in pandas 1.3
# and removed in 2.0; `on_bad_lines="warn"` keeps the same behaviour (skip the
# malformed line and report it) on current pandas.
df = pd.read_csv("ner.csv", encoding="ISO-8859-1", on_bad_lines="warn")

b'Skipping line 281837: expected 25 fields, saw 34\n'


In [117]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,...,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,...,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,...,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,...,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,...,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [127]:
# Remove the redundant 'Unnamed: 0' column. errors='ignore' makes this cell safe
# to re-run: a second execution is a no-op instead of raising KeyError once the
# column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [128]:
# Report how many rows the frame currently holds.
print('Total number of rows: ', df.shape[0])

Total number of rows:  1050795


#### Every column except the sentence index contains string values.

In [129]:
# Show the dtype pandas assigned to each column.
print('Datatypes:\n', df.dtypes)

Datatypes:
 lemma               object
next-lemma          object
next-next-lemma     object
next-next-pos       object
next-next-shape     object
next-next-word      object
next-pos            object
next-shape          object
next-word           object
pos                 object
prev-iob            object
prev-lemma          object
prev-pos            object
prev-prev-iob       object
prev-prev-lemma     object
prev-prev-pos       object
prev-prev-shape     object
prev-prev-word      object
prev-shape          object
prev-word           object
sentence_idx       float64
shape               object
word                object
tag                 object
dtype: object


#### Check for duplicate rows. If there are any, we will drop them

In [132]:
# Count rows that repeat an earlier row. Summing the boolean mask avoids the
# redundant `== True` comparison and building a filtered copy just to measure it.
int(df.duplicated().sum())

281839

In [133]:
# Keep the first occurrence of every repeated row and discard the later copies.
df = df.drop_duplicates(keep='first') 

#### Number of rows after deleting duplicates

In [134]:
# Row count of df at this point in the notebook.
len(df)

768956

### 2. The raw file has 25 columns; 24 remain after dropping the redundant 'Unnamed: 0' column. Explore the types and check for NaN values.

In [136]:
# Materialise the column labels once so they can be both listed and counted.
cols = list(df.columns)
print('Columns in the dataset: \n', cols)
print('Total number of columns:', len(cols))

Columns in the dataset: 
 ['lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos', 'next-next-shape', 'next-next-word', 'next-pos', 'next-shape', 'next-word', 'pos', 'prev-iob', 'prev-lemma', 'prev-pos', 'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape', 'prev-prev-word', 'prev-shape', 'prev-word', 'sentence_idx', 'shape', 'word', 'tag']
Total number of columns: 24


#### Let's check if there are NaN values in 'tag'

In [137]:
# True if at least one row has a missing 'tag' value.
df['tag'].isnull().values.any()

True

In [138]:
# Show the rows whose 'tag' is missing. The null mask is already boolean, so it
# can index df directly without the redundant `== True` comparison.
df[df['tag'].isnull()]

Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,pos,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
689433,domin,countri,.,.,punct,.nister,O,,,,...,,,,,,,,,,


#### Since there is only one line which is clearly missing most of the other information, we are going to drop it.

In [139]:
# Drop by condition rather than by the hard-coded row label 689433, so the cell
# does not depend on one specific index value and can be re-run without a KeyError.
df = df.dropna(subset=['tag'])

#### Let's check for NaN values in the entire DataFrame. None remain, so no further handling of missing values is needed.

In [140]:
# Per-column count of missing values.
df.isnull().sum()

lemma              0
next-lemma         0
next-next-lemma    0
next-next-pos      0
next-next-shape    0
next-next-word     0
next-pos           0
next-shape         0
next-word          0
pos                0
prev-iob           0
prev-lemma         0
prev-pos           0
prev-prev-iob      0
prev-prev-lemma    0
prev-prev-pos      0
prev-prev-shape    0
prev-prev-word     0
prev-shape         0
prev-word          0
sentence_idx       0
shape              0
word               0
tag                0
dtype: int64

#### How many different words appear in the dataset:

In [141]:
# Number of distinct values in the 'word' column.
df['word'].nunique()

30172

### 3. Explore the 'tag' column.

In [142]:
# Distinct NE tag values and how often each occurs. The first label previously
# read "column names", but the number printed is the count of distinct values
# in the 'tag' column.
print('Number of distinct tag values:', df['tag'].nunique())
print('Value counts of \'tag\':\n ', df['tag'].value_counts())

Total different types of column names: 17
Value counts of 'tag':
  O        651237
B-geo     27582
B-tim     14854
B-org     14846
I-per     12594
B-per     12397
I-org     12290
B-gpe     11834
I-geo      5387
I-tim      4663
B-art       290
B-eve       233
I-eve       200
I-art       188
B-nat       158
I-gpe       155
I-nat        47
Name: tag, dtype: int64


#### How many values of each type of entity are there: 

In [143]:
# Count rows per entity type (B- and I- tags together).
# - Assign instead of `+=` so re-running the cell does not double the totals.
# - Match the exact '-<type>' suffix instead of an unanchored regex substring
#   search, so a type code can never match inside an unrelated tag.
# - na=False treats a missing tag as "no match" instead of propagating NaN.
for entity in entity_type_dict:
    is_entity = df['tag'].str.endswith('-' + entity, na=False)
    entity_type_dict[entity] = int(is_entity.sum())

In [144]:
# List the entity types from most to least frequent.
ranked = sorted(entity_type_dict.items(), key=lambda pair: pair[1], reverse=True)
for entity, total in ranked:
    print("Entity: ", entity, "Total: ", total)

Entity:  geo Total:  32969
Entity:  org Total:  27136
Entity:  per Total:  24991
Entity:  tim Total:  19517
Entity:  gpe Total:  11989
Entity:  art Total:  478
Entity:  eve Total:  433
Entity:  nat Total:  205


### 4. Merge the words into a dictionary of sentences, ready for future use

In [148]:
def get_sentence(sentence_group):
    """Convert one sentence's rows into a list of token tuples and cache it.

    Parameters
    ----------
    sentence_group : pandas.DataFrame
        Rows belonging to one sentence; must provide the columns
        'sentence_idx', 'word', 'pos', 'lemma' and 'tag'.

    Returns
    -------
    list of tuple
        One ``(word, pos, lemma, tag)`` tuple per row, in row order.

    Side effect: the same list is stored in the module-level ``sentence_dict``
    under the 'sentence_idx' value of the group's first row.
    """
    key = sentence_group['sentence_idx'].iloc[0]
    fields = ('word', 'pos', 'lemma', 'tag')
    # Pull each column out as a plain list, then zip them row-wise.
    tokens = list(zip(*(sentence_group[name].tolist() for name in fields)))
    sentence_dict[key] = tokens
    return tokens

In [149]:
def write_sentence(sentence):
    """Print a sentence as plain text.

    ``sentence`` is an iterable of tuples whose first element is the word;
    the words are joined with single spaces and printed on one line.
    """
    print(' '.join(token[0] for token in sentence))
        

#### For further use we will probably keep only these five columns: 'sentence_idx', 'lemma', 'pos', 'word', 'tag'.

In [151]:
# Keep only the columns needed per token, then bucket the rows by sentence.
df_grouped = df[['sentence_idx', 'lemma', 'pos', 'word', 'tag']].groupby('sentence_idx')

#### Each sentence is stored as a list. Each word or punctuation mark forms a tuple together with its POS tag, lemma and NE tag, in the order (word, POS, lemma, NE tag).

In [153]:
# Iterating the groupby yields (key, sub-frame) pairs; only the sub-frame is
# handed to get_sentence, which records it in sentence_dict as a side effect.
for _s_idx, s_rows in df_grouped:
    get_sentence(s_rows)

In [154]:
# sentence_dict holds one entry per distinct sentence_idx value.
print('Total number of sentences:',len(sentence_dict))

Total number of sentences: 35177


#### Example of a sentence in a dictionary:

In [155]:
# Keys come from the float64 'sentence_idx' column; the int 1 still finds the
# entry stored under 1.0 because the two compare (and hash) equal.
sentence_dict[1]

[('Thousands', 'NNS', 'thousand', 'O'),
 ('of', 'IN', 'of', 'O'),
 ('demonstrators', 'NNS', 'demonstr', 'O'),
 ('have', 'VBP', 'have', 'O'),
 ('marched', 'VBN', 'march', 'O'),
 ('through', 'IN', 'through', 'O'),
 ('London', 'NNP', 'london', 'B-geo'),
 ('to', 'TO', 'to', 'O'),
 ('protest', 'VB', 'protest', 'O'),
 ('the', 'DT', 'the', 'O'),
 ('war', 'NN', 'war', 'O'),
 ('in', 'IN', 'in', 'O'),
 ('Iraq', 'NNP', 'iraq', 'B-geo'),
 ('and', 'CC', 'and', 'O'),
 ('demand', 'VB', 'demand', 'O'),
 ('the', 'DT', 'the', 'O'),
 ('withdrawal', 'NN', 'withdraw', 'O'),
 ('of', 'IN', 'of', 'O'),
 ('British', 'JJ', 'british', 'B-gpe'),
 ('troops', 'NNS', 'troop', 'O'),
 ('from', 'IN', 'from', 'O'),
 ('that', 'DT', 'that', 'O'),
 ('country', 'NN', 'countri', 'O'),
 ('.', '.', '.', 'O')]

In [156]:
# Print only the word component of each tuple, joined by spaces.
write_sentence(sentence_dict[2])

Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "
