In [19]:
import pandas as pd

# Load the .txt files for both languages
source_file = "/Users/ursamatjasec/Desktop/UNI/Software engineering/en-nl/OpenSubtitles.en-nl.en"
target_file = "/Users/ursamatjasec/Desktop/UNI/Software engineering/nl.txt"

# Read each file into a list of sentences
with open(source_file, 'r', encoding='utf-8') as f:
    source_sentences = f.readlines()

with open(target_file, 'r', encoding='utf-8') as f:
    target_sentences = f.readlines()

# Strip whitespace
source_sentences = [line.strip() for line in source_sentences]
target_sentences = [line.strip() for line in target_sentences]

# Ensure both lists are the same length by truncating the longer one
min_length = min(len(source_sentences), len(target_sentences))
source_sentences = source_sentences[:min_length]
target_sentences = target_sentences[:min_length]

# Create a DataFrame with both languages
df = pd.DataFrame({
    'Source Language': source_sentences,
    'Target Language': target_sentences
})

# Display the header of the DataFrame
print("Header:", df.columns.tolist())

# Display the first 5 rows of the DataFrame
print("First 5 rows:\n", df.head())

Header: ['Source Language', 'Target Language']
First 5 rows:
                                      Source Language  \
0                              Go in search of love.   
1                         That is almost proverbial.   
2  Couple of that question for example s if your ...   
3                              What has do you then?   
4  The curtains near and guards to someone contra...   

                           Target Language  
0  Hoi Billy, wil je nog wat trucjes doen?  
1                             Ik kan niet.  
2                            Ik moet gaan.  
3                            Kom, nog één.  
4     Ik zal zaterdag op stap gaan met je.  


In [20]:
# Check volume by displaying the number of rows and columns
num_rows, num_columns = df.shape
print(f"Volume of Data - Number of Sentences: {num_rows}, Number of Languages: {num_columns}")

Volume of Data - Number of Sentences: 299879, Number of Languages: 2


In [21]:
# Check if both 'Source Language' and 'Target Language' columns exist
if 'Source Language' in df.columns and 'Target Language' in df.columns:
    # Calculate sentence lengths (number of characters or words)
    df['source_length'] = df['Source Language'].apply(len)  # Using character length
    df['target_length'] = df['Target Language'].apply(len)
    
    # Calculate the difference in length between source and target sentences
    df['length_diff'] = df['source_length'] - df['target_length']
    
    # Calculate average difference in length
    avg_length_diff = df['length_diff'].mean()
    print(f"Average difference in sentence length: {avg_length_diff} characters")
else:
    print("Source or Target language column not found for analysis.")

Average difference in sentence length: 0.0384555103891903 characters


In [22]:
# Check the data types of each column to analyze variety
print("Variety of Data - Data Types of Features:")
print(df.dtypes)

# Count categorical and numerical columns
num_categorical = df.select_dtypes(include=['object']).shape[1]  # Text-based data
num_numerical = df.select_dtypes(include=['int64', 'float64']).shape[1]  # Numerical data (e.g., sentence lengths)
print(f"Number of Categorical Features: {num_categorical}, Number of Numerical Features: {num_numerical}")

Variety of Data - Data Types of Features:
Source Language    object
Target Language    object
source_length       int64
target_length       int64
length_diff         int64
dtype: object
Number of Categorical Features: 2, Number of Numerical Features: 3


In [23]:
# Check the summary statistics of numerical columns (e.g., sentence lengths, length differences)
summary_statistics = df.describe()
print("Summary Statistics of Numerical Features:")
print(summary_statistics)

# Check the unique values in categorical columns (source and target sentences)
unique_values = df.select_dtypes(include=['object']).nunique()
print("Variety Analysis - Unique Values in Categorical Features:")
print(unique_values)

Summary Statistics of Numerical Features:
       source_length  target_length    length_diff
count  299879.000000  299879.000000  299879.000000
mean       29.791696      29.753240       0.038456
std        24.917848      23.929489      34.625893
min         1.000000       0.000000    -762.000000
25%        14.000000      14.000000     -16.000000
50%        24.000000      24.000000       0.000000
75%        38.000000      38.000000      16.000000
max      1218.000000     773.000000    1176.000000
Variety Analysis - Unique Values in Categorical Features:
Source Language    242468
Target Language    213374
dtype: int64


In [24]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()
print("Veracity Analysis - Missing Values in Each Feature:")
print(missing_values)

# Check for duplicated rows
num_duplicates = df.duplicated().sum()
print(f"Number of Duplicated Rows: {num_duplicates}")

Veracity Analysis - Missing Values in Each Feature:
Source Language    0
Target Language    0
source_length      0
target_length      0
length_diff        0
dtype: int64
Number of Duplicated Rows: 196
