# ANALYSE

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math

Produce a smaller dataset with only English, German and French (no other languages necessary for our purpose, can easily be extended)

In [69]:
# Languages kept for this analysis; add entries to this list to widen the subset.
languages = ["English", "German", "French"]

# Full corpus; the filter below relies on its 'Language' column.
df = pd.read_csv('Data/languages.csv')

# Keep only the rows whose language is in the selection and persist them
# (without the index column).
reduced_df = df.loc[df['Language'].isin(languages)]
reduced_df.to_csv('Data/minimal_languages.csv', index=False)

Export french only into single csv

In [70]:
# Rows labelled "French" only, written to their own CSV (index column omitted).
french_df = df.loc[df['Language'] == "French"]
french_df.to_csv('Data/french.csv', index=False)

Export german only into single csv

In [71]:
# Rows labelled "German" only, written to their own CSV (index column omitted).
german_df = df.loc[df['Language'] == "German"]
german_df.to_csv('Data/german.csv', index=False)

Export english only into single csv

In [72]:
# Rows labelled "English" only, written to their own CSV (index column omitted).
english_df = df.loc[df['Language'] == "English"]
english_df.to_csv('Data/english.csv', index=False)

Common french statistics regarding data frame

In [73]:
# Reload the French-only subset from disk.
french_df = pd.read_csv('Data/french.csv', encoding='utf-8')

# One token list per text: non-breaking spaces become regular spaces, then the
# text is split on single spaces.
# NOTE: split(" ") keeps empty strings for consecutive spaces, so those are
# counted as words below; punctuation attached to a token counts as "letters".
fr_array_of_word_arrays = [
    text.replace(u'\xa0', u' ').split(" ")
    for text in french_df['Text']
]

fr_number_of_sentences = len(fr_array_of_word_arrays)
fr_number_of_words = sum(map(len, fr_array_of_word_arrays))
fr_number_of_letters = sum(
    len(token)
    for tokens in fr_array_of_word_arrays
    for token in tokens
)
fr_average_word_length = fr_number_of_letters / fr_number_of_words

# Sentences, words, characters, mean token length -- one value per line.
print(
    fr_number_of_sentences,
    fr_number_of_words,
    fr_number_of_letters,
    fr_average_word_length,
    sep="\n",
)

1014
23260
124921
5.370636285468616


Common english statistics regarding data frame

In [74]:
# Reload the English-only subset from disk.
english_df = pd.read_csv('Data/english.csv', encoding='utf-8')

# One token list per text: non-breaking spaces become regular spaces, then the
# text is split on single spaces.
# NOTE: split(" ") keeps empty strings for consecutive spaces, so those are
# counted as words below; punctuation attached to a token counts as "letters".
eng_array_of_word_arrays = [
    text.replace(u'\xa0', u' ').split(" ")
    for text in english_df['Text']
]

eng_number_of_sentences = len(eng_array_of_word_arrays)
eng_number_of_words = sum(map(len, eng_array_of_word_arrays))
eng_number_of_letters = sum(
    len(token)
    for tokens in eng_array_of_word_arrays
    for token in tokens
)
eng_average_word_length = eng_number_of_letters / eng_number_of_words

# Sentences, words, characters, mean token length -- one value per line.
print(
    eng_number_of_sentences,
    eng_number_of_words,
    eng_number_of_letters,
    eng_average_word_length,
    sep="\n",
)

1385
30382
160611
5.286386676321506


Common german statistics regarding data frame

In [75]:
# Reload the German-only subset from disk.
german_df = pd.read_csv('Data/german.csv', encoding='utf-8')

# One token list per text: non-breaking spaces become regular spaces, then the
# text is split on single spaces.
# NOTE: split(" ") keeps empty strings for consecutive spaces, so those are
# counted as words below; punctuation attached to a token counts as "letters".
ger_array_of_word_arrays = [
    text.replace(u'\xa0', u' ').split(" ")
    for text in german_df['Text']
]

ger_number_of_sentences = len(ger_array_of_word_arrays)
ger_number_of_words = sum(map(len, ger_array_of_word_arrays))
ger_number_of_letters = sum(
    len(token)
    for tokens in ger_array_of_word_arrays
    for token in tokens
)
ger_average_word_length = ger_number_of_letters / ger_number_of_words

# Sentences, words, characters, mean token length -- one value per line.
print(
    ger_number_of_sentences,
    ger_number_of_words,
    ger_number_of_letters,
    ger_average_word_length,
    sep="\n",
)

470
5927
29653
5.003036949552894


## Analysis of typical attributes, allowing direct identification of a language

### Special character (french specific)

In [76]:
# Accented characters treated as French-specific; characters are lower-cased
# before the membership test, so upper-case variants match too.
fr_special_char = ["é", "è", "ê", "à", "â", "ù", "ô", "ç"]

# Number of characters in the French token lists that are special characters.
fr_occurence = sum(
    1
    for tokens in fr_array_of_word_arrays
    for token in tokens
    for symbol in token
    if symbol.lower() in fr_special_char
)

# Share of special characters among all counted characters of the French texts.
fr_occurence_rate = fr_occurence / fr_number_of_letters

print(fr_occurence, f"{round(fr_occurence_rate*100, 2)} %", sep="\n")

4138
3.31 %


Analyse the occurrence rate of the French special characters in German and English (it should be very low for the attribute to have a high entropy value)

In [77]:
# Reload both subsets. The counting below does not read these frames; it works
# on the token lists built in earlier cells.
english_df = pd.read_csv('Data/english.csv', encoding='utf-8')
german_df = pd.read_csv('Data/german.csv', encoding='utf-8')

# French special characters found in the English token lists.
eng_occurence = sum(
    1
    for tokens in eng_array_of_word_arrays
    for token in tokens
    for symbol in token
    if symbol.lower() in fr_special_char
)

# French special characters found in the German token lists.
ger_occurence = sum(
    1
    for tokens in ger_array_of_word_arrays
    for token in tokens
    for symbol in token
    if symbol.lower() in fr_special_char
)

# Rates relative to each language's total character count.
eng_occurence_rate = eng_occurence / eng_number_of_letters
ger_occurence_rate = ger_occurence / ger_number_of_letters

# The rate is rounded to two decimals of a percent, so very small rates print as 0.0 %.
print("English", eng_occurence, f"{round(eng_occurence_rate*100, 2)} %\n", sep="\n")
print("German", ger_occurence, f"{round(ger_occurence_rate*100, 2)} %", sep="\n")

English
5
0.0 %

German
0
0.0 %


Very low values (5 for English and 0 for German), therefore the attribute "contains_spec_char" is a good indication that a text is French (not containing such a character does not imply that the text is not French though, as the occurrence rate is quite low, only ~3.31%)

### Special character (german specific)

In [78]:
# Characters treated as German-specific; characters are lower-cased before the
# membership test, so upper-case umlauts match too.
ger_special_char = ["ä", "ö", "ü", "ß"]

# Number of characters in the German token lists that are special characters.
ger_occurence = sum(
    1
    for tokens in ger_array_of_word_arrays
    for token in tokens
    for symbol in token
    if symbol.lower() in ger_special_char
)

# Share of special characters among all counted characters of the German texts.
ger_occurence_rate = ger_occurence / ger_number_of_letters

print(ger_occurence, f"{round(ger_occurence_rate*100, 2)} %", sep="\n")

490
1.65 %


Analyse the occurrence rate of the German special characters in French and English (it should be very low for the attribute to have a high entropy value)

In [79]:
# Reload the English and French subsets. french_df must come from the French
# file: loading 'Data/german.csv' here would leave German rows under the name
# french_df for every later cell.
# The counting below does not read these frames; it works on the token lists
# built in earlier cells.
english_df = pd.read_csv('Data/english.csv', encoding='utf-8')
french_df = pd.read_csv('Data/french.csv', encoding='utf-8')

# German special characters found in the English token lists
# (characters are lower-cased before the membership test).
eng_occurence = sum(
    1
    for tokens in eng_array_of_word_arrays
    for token in tokens
    for symbol in token
    if symbol.lower() in ger_special_char
)

# German special characters found in the French token lists.
fr_occurence = sum(
    1
    for tokens in fr_array_of_word_arrays
    for token in tokens
    for symbol in token
    if symbol.lower() in ger_special_char
)

# Rates relative to each language's total character count.
eng_occurence_rate = eng_occurence / eng_number_of_letters
fr_occurence_rate = fr_occurence / fr_number_of_letters

# The rate is rounded to two decimals of a percent, so very small rates print as 0.0 %.
print("English")
print(eng_occurence)
print(f"{round(eng_occurence_rate*100, 2)} %\n")

print("French")
print(fr_occurence)
print(f"{round(fr_occurence_rate*100, 2)} %")

English
3
0.0 %

French
2
0.0 %


The English and French texts make only very minimal use of the German "special" characters (3 occurrences for English and 2 for French), therefore an occurrence of one of the German special characters is a very good indication that the language is German.

We can conclude that the usage of one of the French special characters (["é", "è", "ê", "à", "â", "ù", "ô", "ç"]) or of one of the German special characters (["ä", "ö", "ü", "ß"]), both matched case-insensitively, is a very good indicator that the language is French or German respectively (depending on the character).

The French special characters only had an occurrence rate of ~3.31 %, the German special characters only 1.65 %.
Therefore, we clearly need further attributes to determine which language a text sample is written in, in case no special characters occur (either intentionally or unintentionally).

## More global language analysis (no direct identification)