In [1]:
import pandas as pd 

In [2]:
IDF = pd.read_csv('idf_report.csv')

In [3]:
IDF.tail() #I want to start from 7th oct. 

Unnamed: 0,Date,Time,Location,Description
176,07/10/2023,,Unknown,Sirens sounded in the city of Jerusalem.
177,07/10/2023,,Israel,Sirens continue to sound throughout Central an...
178,07/10/2023,,Gaza,Initial report – A number of terrorists have i...
179,07/10/2023,,Gaza,"Over the past half an hour, barrages of rocket..."
180,07/10/2023,,Israel,Sirens sounded in Central and Southern Israel....


There is NaN in time. And the "Unknown" is not counted, even in the first-last observation it's "Jerusalem" 

In [5]:
print(IDF.isnull().sum())


Date            0
Time           85
Location        0
Description     0
dtype: int64


There is one Duplicate 

In [7]:
print(IDF.duplicated().sum())


1


In [8]:
IDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         181 non-null    object
 1   Time         96 non-null     object
 2   Location     181 non-null    object
 3   Description  181 non-null    object
dtypes: object(4)
memory usage: 5.8+ KB


So we have 181, and the 96 out of 181 wich is the missing Values (85)
all of them are objects, the time and Date should be in datetime. 

In [10]:
IDF.shape #alright. 

(181, 4)

In [11]:
IDF['Description'].duplicated(). sum() #I think it's weird to have the same description. 

11

In [12]:
IDF['Date'].duplicated(). sum()

173

In [13]:
IDF['Time'].duplicated(). sum()

91

In [14]:
IDF['Location'].duplicated(). sum()

177

In [15]:
IDF['Location'].value_counts

<bound method IndexOpsMixin.value_counts of 0         Gaza
1         Gaza
2      Lebanon
3      Unknown
4       Israel
        ...   
176    Unknown
177     Israel
178       Gaza
179       Gaza
180     Israel
Name: Location, Length: 181, dtype: object>

IDF['Date'] = pd.to_datetime(IDF['Date'], errors='coerce')
IDF['Time'] = pd.to_datetime(IDF['Time'], errors='coerce')

In [72]:
IDF.dtypes

Date                     datetime64[ns]
Time                     datetime64[ns]
Location                         object
Description                      object
Num_Terrorists                   object
Num_Damaged_Buildings            object
Ammunition_Used                  object
dtype: object

In [19]:
IDF =IDF.sort_values(by='Date').reset_index(drop=True)

In [96]:
IDF.head()

NameError: name 'IDF' is not defined

# Let's try to manipulate Texts! 

In [22]:
import re


In [23]:
def count_terrorists(description):
    matches = re.findall(r'(\d+)\s+terrorists', description.lower())
    return sum(int(match) for match in matches)

total_terrorists = IDF['Description'].apply(count_terrorists).sum()

print(f'Total number of terrorists mentioned: {total_terrorists}')

Total number of terrorists mentioned: 23


In [24]:
def count_hamas(description):
    matches = re.findall(r'(\d+)\s+hamas', description.lower())
    return sum(int(match) for match in matches)
total_hamas = IDF['Description'].apply(count_hamas).sum()

print(f'Total number of Hamas mentioned: {total_hamas}')

Total number of Hamas mentioned: 507


okay that was warming up. lets make the "description" column better (number of terrorist, number of  damage buildings, Ammunition used(kind and number),

In [26]:
description_split = IDF['Description'].str.extract(r'(?P<Num_Terrorists>\d+) terrorists, (?P<Num_Damaged_Buildings>\d+) damaged buildings, Ammunition used: (?P<Ammunition_Used>.+)', expand=True)
IDF = pd.concat([IDF, description_split], axis=1)


In [27]:
IDF.head()

Unnamed: 0,Date,Time,Location,Description,Num_Terrorists,Num_Damaged_Buildings,Ammunition_Used
0,2023-07-10,NaT,Israel,Sirens sounded in Central and Southern Israel....,,,
1,2023-07-10,NaT,Unknown,"The Commanding Officer of the Nahal brigade, C...",,,
2,2023-07-10,NaT,Unknown,Attached is an IDF Announcement regarding the ...,,,
3,2023-07-10,NaT,Gaza,IDF fighter jets struck military infrastructur...,,,
4,2023-07-10,NaT,Gaza,The IDF is currently striking a number of targ...,,,


In [5]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Download the Punkt tokenizer model
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [84]:

# Define functions
def extract_sentences(text):
    return sent_tokenize(text)

def assign_sentences(sentences):
    Num_terrorists = ""
    Num_damaged_buildings = ""
    Ammunition_used = ""
    
    keywords1 = {
        'Num_Terrorists': ['terrorist', 'attackers', 'militants', 'dozens of terrorists','dozens', 'terrorists were neutralized']}
    keywords2 = {
        'Num_Damaged_Buildings': ['damaged', 'destroyed','struck the offices']} 
    keywords3 = {
        'Ammunition_Used': ['ammunition', 'ammunitions', 'bullets', 'shells','launched toward', 'Ammunition', 'kilo']
    }
    
    for sentence in sentences:
        if any(keyword in sentence.lower() for keyword in keywords1['Num_Terrorists']):
            num_terrorists = sentence
        elif any(keyword in sentence.lower() for keyword in keywords2['Num_Damaged_Buildings']):
            num_damaged_buildings = sentence
        elif any(keyword in sentence.lower() for keyword in keywords3['Ammunition_Used']):
            ammunition_used = sentence
    
    return pd.Series([Num_terrorists, Num_damaged_buildings, Ammunition_used])


In [94]:
df.tail(8)

Unnamed: 0,Date,Time,Location,Description,Num_Terrorists,Num_Damaged_Buildings,Ammunition_Used
173,07/10/2023,,Gaza,Dozens of IDF fighter jets struck a number of ...,Dozens of IDF fighter jets struck a number of ...,,
174,07/10/2023,,Unknown,IDF operation against the Hamas terrorist orga...,IDF operation against the Hamas terrorist orga...,,
175,07/10/2023,,Gaza,The IDF declares a state of alert for war. ove...,The Hamas terrorist organization is responsibl...,,
176,07/10/2023,,Unknown,Sirens sounded in the city of Jerusalem.,,,
177,07/10/2023,,Israel,Sirens continue to sound throughout Central an...,,,
178,07/10/2023,,Gaza,Initial report – A number of terrorists have i...,Initial report – A number of terrorists have i...,,
179,07/10/2023,,Gaza,"Over the past half an hour, barrages of rocket...","Over the past half an hour, barrages of rocket...",,
180,07/10/2023,,Israel,Sirens sounded in Central and Southern Israel....,,,


In [108]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')


  df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
  df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
